#include "data_management.cuh"
#include "hi_res_timer.h"
#include "thread_manager.h"
#include "main_cu.h"
#include "matches_manager.h"
#include "distance_matrix.h"

#include "needleman_wunsch_global_match_gpu.cuh"
#include "waterman_eggert_gpu.cuh"
#include "memory_kernels_gpu.cuh"
#include "neighbour_joining.h"

#include <math.h>

using std::string;
using namespace Exceptions;
using namespace Data;
using Data::Sequences;
using Data::MatchesManager;
using Exceptions::Exception;

// NULL-terminated registry of the selectable algorithms. It stays NULL until
// init() allocates it, which getAlgorithm() triggers on its first call.
ThreadManagerRunnable** ThreadManagerRunnable::runnables = NULL;

/*******************************************************************************
 * Creates the registry of runnables (algorithms) that can be selected by name.
 * The array is NULL-terminated so that getAlgorithm() can walk it without
 * knowing its length.
 *******************************************************************************/

void ThreadManagerRunnable::init()
{
    // three algorithms plus one slot for the terminating NULL
    const int slots = 4;
    runnables = new ThreadManagerRunnable*[slots];

    int next = 0;
    runnables[next++] = new PrimaryLibraryBuilder();
    runnables[next++] = new ExtendedLibraryBuilder();
    runnables[next++] = new MultipleSequenceAlignment();
    runnables[next]   = NULL;
}

/*******************************************************************************
 * Looks up an algorithm by the name given in the "--alg" / "-a" argument or
 * in the configuration file entry "alg = ...".
 *
 *  @ am - arguments manager the algorithm name is read from
 *
 * Returns the registered runnable whose getAlgorithmName() equals the
 * requested name, or NULL when no registered runnable matches.
 * The registry is created on first use.
 *******************************************************************************/

ThreadManagerRunnable* ThreadManagerRunnable::getAlgorithm(ArgumentsManager* am)
{
    if (!runnables)
        init();

    // The requested name does not change while we scan, so fetch it once
    // instead of re-reading (and copying) the parameter on every iteration.
    const std::string requested = am->getStringParam("alg", "a");

    // The registry ends with a NULL entry, which doubles as the "not found" result.
    ThreadManagerRunnable** candidate = runnables;
    while ((*candidate) && strcmp((*candidate)->getAlgorithmName(), requested.c_str()) != 0)
        candidate++;

    return (*candidate);
}

/*******************************************************************************
 * Entry point handed to the ThreadManager. The ThreadManager requires a static
 * two-parameter function:
 *  @ tm   - pointer to the ThreadManager (not used here)
 *  @ data - pointer to a RunnableWithParams describing the job
 * This function satisfies that requirement and forwards the job to the
 * algorithm's actualInvokedMethod(), so an algorithm only has to implement a
 * few methods and can ignore the ThreadManager machinery.
 * The RunnableWithParams object is deleted here once the job has finished.
 *******************************************************************************/

void ThreadManagerRunnable::invoker(ThreadManager* tm, void* data)
{
    RunnableWithParams* job = static_cast<RunnableWithParams*>(data);

    // run the job on the algorithm that queued it
    job->algorithm->actualInvokedMethod(job->params);

    // the wrapper was allocated by whoever queued the job; it is released here
    delete job;
}

/*******************************************************************************
 * Loads the arguments specific to this algorithm. If a required parameter is
 * missing, the algorithm's default settings file is opened, the arguments are
 * reloaded and loading is attempted a second time. When the second attempt
 * also fails, the message is printed and the exception is re-thrown.
 *******************************************************************************/

void ThreadManagerRunnable::loadArguments(ArgumentsManager* am)
{
    // first attempt: only what the user supplied
    try
    {
        actuallyArgumentsLoading(am);
        return;
    }
    catch (ParamDoesntExistException*)
    {
        // something is missing - pull in the algorithm's defaults
        am->openConfig(defaultSettingsFile());
        am->reloadArgs();
    }

    // second attempt: user settings plus defaults
    try
    {
        actuallyArgumentsLoading(am);
    }
    catch (ParamDoesntExistException* stillMissing)
    {
        printf("%s\n", stillMissing->getMessage().c_str());
        throw;
    }
}

/*******************************************************************************
 * PLWindowParams::getEstimatedComplexity() estimates the cost of processing
 * the window pair (windowX, windowY). The value is used to order the jobs
 * (see compareAlignmentScoreKernelParams).
 *
 * Cpx = S(X) * S(Y)
 *
 * Where:
 * S(n) is the value returned by Sequences::getWindowSum() for window n
 *      (presumably the summed sequence lengths of the window - confirm
 *      against Sequences).
 *
 * Two adjustments are applied afterwards:
 *  - for the job with partId == 1 the estimate is scaled up when the number
 *    of thread blocks it can launch is smaller than maxMultiprocessorCount;
 *  - the result is never smaller than
 *    getWindowMax(X) * getWindowMax(Y) * maxMultiprocessorCount.
 *******************************************************************************/

long long PLWindowParams::getEstimatedComplexity()
{
    long long result = (long long) seqs->getWindowSum(windowSize, windowX, blockShape);
    result *= (long long) seqs->getWindowSum(windowSize, windowY, blockShape);

    // PrimaryLibraryBuilder::run() assigns partId 1 to the pair made of the
    // last window with itself; that window may be only partially filled.
    if (partId == 1)
    {
        // sequences in the partial window -> blocks per side (rounded up)
        // -> blocks in the whole square
        int multiprocessorCorrection = seqs->getSequenceNumber() % windowSize;
        multiprocessorCorrection += blockShape - 1;
        multiprocessorCorrection /= blockShape;
        multiprocessorCorrection *= multiprocessorCorrection;

        // fewer blocks than multiprocessors: part of the device stays idle,
        // so the job effectively costs proportionally more
        if ((multiprocessorCorrection < maxMultiprocessorCount) && (multiprocessorCorrection != 0))
        {
            result *= maxMultiprocessorCount;
            result /= multiprocessorCorrection;
        }
    }

    // Lower bound of the estimate. Each factor is widened before multiplying
    // so the product cannot overflow a narrower type before it is compared
    // with the 64-bit estimate.
    long long lowerBound = (long long) seqs->getWindowMax(windowSize, windowX);
    lowerBound *= (long long) seqs->getWindowMax(windowSize, windowY);
    lowerBound *= (long long) maxMultiprocessorCount;

    result = MAX(result, lowerBound);

    return result;
}

/*******************************************************************************
 * compareAlignmentScoreKernelInvokerParams compares this datatype by the      *
 * estimated complexity.                                                       *
 *******************************************************************************/

int PLWindowParams::compareAlignmentScoreKernelParams(const void* first, const void* second)
{
    PLWindowParams* sFirst = *((PLWindowParams**) first);
    PLWindowParams* sSecond = *((PLWindowParams**) second);
    if (sFirst->getEstimatedComplexity() < sSecond->getEstimatedComplexity())
        return 1;
    if (sFirst->getEstimatedComplexity() == sSecond->getEstimatedComplexity())
        return 0;
    return -1;
}

// Starts with no library and no distance matrix; both are created in run().
PrimaryLibraryBuilder::PrimaryLibraryBuilder()
{
    library = NULL;
    dm = NULL;
}

// Releases the distance matrix and the primary library owned by this builder.
PrimaryLibraryBuilder::~PrimaryLibraryBuilder()
{
    // deleting a NULL pointer is a no-op, so no explicit checks are needed
    delete this->dm;
    delete this->library;
}

/*******************************************************************************
 * Reads all settings of the primary library builder from the arguments
 * manager, then loads the substitution matrix and the input sequences.
 *
 *  @ am - source of the command line / configuration file parameters
 *
 * Parameters read unconditionally: laligns, gapext, gapopen,
 * maxDevMultiprocessor, winsize, submatrix, infile.
 * Optional parameters (default used when absent): wecutoff (0), nwcutoff
 * (INT_MIN), entireMatrix (false), ploutfile (NULL), njoutfile (NULL),
 * computeNJ (false).
 *
 * NOTE(review): loadArguments() expects this method to let a
 * ParamDoesntExistException* escape when a parameter is missing - presumably
 * thrown by the ArgumentsManager getters; confirm there.
 *******************************************************************************/

void PrimaryLibraryBuilder::actuallyArgumentsLoading(ArgumentsManager* am)
{
    this->K                      = am->getIntParam("laligns", "K" );
    this->gapExt                 = am->getIntParam("gapext" , "ge");
    this->gapOpen                = am->getIntParam("gapopen", "go");
    this->maxMultiprocessorCount = am->getIntParam("maxDevMultiprocessor", "MPs");
    this->windowSize             = am->getIntParam("winsize", "ws");


    if(am->containParam("wecutoff", "wecut"))
        this->wecutoff           = am->getIntParam("wecutoff", "wecut");
    else
        this->wecutoff           = 0; //no cutoffs

    if(am->containParam("nwcutoff", "nwcut"))
        this->nwcutoff           = am->getIntParam("nwcutoff", "nwcut");
    else
        this->nwcutoff           = INT_MIN; //no cutoffs

    if(am->containParam("entireMatrix", "em"))
        this->entireMatrix       = am->getBoolParam("entireMatrix", "em");
    else
        this->entireMatrix       = false;

    if(am->containParam("ploutfile", "plo"))
        this->ploutfile          = am->getParam("ploutfile", "plo");
    else
        this->ploutfile          = NULL;

    if(am->containParam("njoutfile", "njo"))
        this->njoutfile          = am->getParam("njoutfile", "njo");
    else
        this->njoutfile          = NULL;

    // The neighbour-joining switch has its own flag, which run() reads.
    // It must not overwrite entireMatrix.
    if(am->containParam("computeNJ", "nj"))
        this->computeNJ          = am->getBoolParam("computeNJ", "nj");
    else
        this->computeNJ          = false;


    // read both file names before allocating anything, so a missing
    // parameter cannot leave a half-constructed state behind
    string smFile                = am->getStringParam("submatrix", "sm");
    string inFile                = am->getStringParam("infile"   , "i" );


    this->sm                     = new SubstitutionMatrix(smFile.c_str());
    this->seqs                   = new Sequences(inFile.c_str(),this->sm);

    this->seqs->load();
    this->seqs->sortSequences();
}



/*******************************************************************************
 * Returns the path of the file holding the default settings of this algorithm.
 * The returned string has static storage and must not be modified or freed.
 *******************************************************************************/

char* PrimaryLibraryBuilder::defaultSettingsFile()
{
    static const char path[] = "etc/plb-defaults.cnf";

    // the interface returns a non-const pointer; callers only read it
    return const_cast<char*>(path);
}

/*******************************************************************************
 * Builds the primary library for the loaded sequences.
 *
 *  @ tm - thread manager used to execute the per-window jobs
 *
 * Steps:
 *  1. hand the substitution matrix and gap penalties to every worker thread,
 *  2. create one job per window pair (windowX <= windowY), sort the jobs by
 *     estimated complexity (most expensive first) and queue them,
 *  3. wait for all jobs and collect their partial primary libraries,
 *  4. merge the partial libraries into this->library,
 *  5. optionally save the library to ploutfile,
 *  6. optionally run neighbour joining on a copy of the distance matrix.
 *******************************************************************************/
void PrimaryLibraryBuilder::run(ThreadManager* tm)
{
    HiResTimer timer;
    timer.start();

    int gpus = tm->getThreadsNumber();
    smParams.sm = seqs->getSubtitutionMatrix();
    smParams.gapOpen = gapOpen;
    smParams.gapExtension = gapExt;
    smParams.windowSize = windowSize;
    smParams.memoryOffset = windowSize * windowSize;

    // one request per worker, addressed by its index
    for (int i = 0; i < gpus; i++)
    {
        tm->request(copySMToConstInThread, (void*) & smParams, i);
    }

    int sequenceNumber = seqs->getSequenceNumber();
    // number of windows per side, rounded up
    int windowsNumber = (sequenceNumber - 1) / windowSize + 1;

    this->dm = new DistanceMatrix(seqs);

    int partId = 1;
    // [windowsNumber*(windowsNumber + 1)] / 2 - the number of data parts to be processed
    int parts = windowsNumber * (windowsNumber + 1) / 2;
    PLWindowParams* params;

    PLWindowParams** jobs = new PLWindowParams*[parts];


    // we iterate through all the window pairs on and below the diagonal
    for (int j = windowsNumber - 1; j >= 0; j--)
    {
        for (int i = j; i >= 0; i--)
        {
            params = new PLWindowParams();
            params->blockShape = ALIGNMENT_MATCH_BLOCK_SHAPE;
            params->windowX = i;
            params->windowY = j;
            params->partId = partId++;
            params->parts = parts;
            params->seqs = seqs;
            params->maxMultiprocessorCount = maxMultiprocessorCount;
            params->windowSize = windowSize;

            jobs[params->partId - 1] = params;
        }
    }

    // long jobs first and short jobs last is better for load balancing
    qsort(jobs, parts, sizeof (PLWindowParams*), PLWindowParams::compareAlignmentScoreKernelParams);

    for (int i = 0; i < parts; i++)
    {
        // the wrapper is deleted by invoker() after the job has run
        RunnableWithParams* invokingParams = new RunnableWithParams();
        invokingParams->algorithm = this;
        invokingParams->params = jobs[i];
        // NOTE(review): -1 presumably means "any worker" - confirm in ThreadManager
        tm->request(invoker, (void*)invokingParams, -1);
    }

    tm->wait();


    // windowsNumber x windowsNumber grid of partial libraries, row = windowY
    PartialPrimaryLibrary** PPLs = new PartialPrimaryLibrary*[windowsNumber*windowsNumber];
    for (int i = 0; i < parts; i++)
    {
        PPLs[jobs[i]->windowY*windowsNumber + jobs[i]->windowX] =
                jobs[i]->pplX;

        if(jobs[i]->windowX != jobs[i]->windowY)
        {
            // mirrored entry above the diagonal
            PPLs[jobs[i]->windowX*windowsNumber + jobs[i]->windowY] =
                jobs[i]->pplY;
        }
    }

    for (int i = 0; i < parts; i++)
        delete jobs[i];
    // jobs was allocated with new[], so it must be released with delete[]
    delete[] jobs;


    if(this->library == NULL)
        this->library = new PrimaryLibrary();
    HiResTimer timer1;
    timer1.start();
    // NOTE(review): PPLs is not released here - verify whether createPL()
    // takes ownership of the array and of the partial libraries.
    this->library->createPL(PPLs, windowsNumber, sequenceNumber);
    timer1.stop();
    printf("PL transcription time: %dms\n", (int) timer1.getElapsedTime());

    timer.stop();
    printf("%s total: %dms\n", getAlgorithmName(), (int) timer.getElapsedTime());



    // Saving primary library to a file using T-Coffee file format.
    if(this->ploutfile != NULL)
    {
        timer.start();

        this->library->saveLibraryToFile(this->ploutfile, NULL, this->seqs, this->K);

        timer.stop();
        printf("Saving library to file %s: %dms\n", this->ploutfile, (int)timer.getElapsedTime());
    }

    // NJ
    if(computeNJ)
    {
        // declared outside the try block so it is released on the error path too
        DistanceMatrix* dmNJ = NULL;
        try
        {
            // neighbour joining works on its own copy of the distance matrix
            dmNJ = new DistanceMatrix(this->seqs);
            dmNJ->copyDistanceMatrix(dm);

            nj = new NeighbourJoining(this->njoutfile, dmNJ, this->seqs);
            nj->run();
        }
        catch(IncorrectInputData* ex)
        {
            // never hand the message object itself to a variadic function;
            // go through a std::string so "%s" always receives a C string
            std::string message(ex->getMessage());
            printf("%s\n", message.c_str());
        }
        delete dmNJ;
    }
}


// Name under which this algorithm is selected (compared by getAlgorithm()).
const char* PrimaryLibraryBuilder::getAlgorithmName()
{
    static const char name[] = "plb";
    return name;
}


// Debugging hook called before each device allocation with the number of
// bytes about to be requested. It currently does nothing; the disabled code
// below printed the free device memory next to the requested size.
void saveMalloc(unsigned int size)
{
    (void) size; // intentionally unused while the diagnostics are disabled

//    unsigned int free;
//    unsigned int total;
//
//    cuMemGetInfo(&free, &total);
//    printf("%u %u\n", free, size);
}

void PrimaryLibraryBuilder::actualInvokedMethod(void* voidParams)
{
    //THIS FUNCTION IS CALLED BY THREAD MANAGER

/*******************************************************************************
* COMMON DECLARATIONS
********************************************************************************/

    PLWindowParams* params = (PLWindowParams*)voidParams;

    params->startSeqs1No = params->windowX * params->windowSize;
    params->startSeqs2No = params->windowY * params->windowSize;
    try
    {
        int maxSeqLength = params->seqs->getMaxSeqLen();

        //one element in AF matrix:
        // - 2 bytes for element of A matrix
        // - 2 bytes for element of F matrix
        short2* AF;
        saveMalloc(sizeof(int) * maxSeqLength * params->windowSize * params->windowSize);
        cudaMalloc(&AF, sizeof(int) * maxSeqLength * params->windowSize * params->windowSize);
        int* scoresDevPtr;

        saveMalloc(sizeof(int) * params->windowSize * params->windowSize);
        cudaMalloc(&scoresDevPtr, sizeof(int) * params->windowSize * params->windowSize);

        //sizeof(back) => sizeof(int) * maxSeqLength * (maxSeqLength/8) * windowSize * windowSize
        //(height+8) * (maxSeqLength+1) -> additional -1 row and -1 column in "back" array
        int height = ((maxSeqLength-1) / 8 + 1) * 8; //8->8, 9->16, 10->16 ...
        //height of this array must be divisible by 8 (ALIGNMENT_MATCH_Y_STEPS)
        unsigned int* back;
        unsigned int  backSize = sizeof(unsigned int) * (height+8) * (maxSeqLength+1) * params->windowSize * (params->windowSize/ALIGNMENT_MATCH_Y_STEPS);
        saveMalloc(backSize);
        cudaMalloc(&back, backSize);
        //texture can't be too big -> since CUDA 3.0 the line below causes errors
        //cudaBindTexture(0, texBack, back, backSize);

        //memory for temporary (intermediate) results (alignments/matches)
        //we need: maxSeqLength*2*windowSize*windowSize
        unsigned int* prePPLDevPtr;
        saveMalloc(sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize);
        cudaMalloc(&prePPLDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize);

        //memory for final results (alignments/matches)
        unsigned int* outPrePPLDevPtr;
        saveMalloc(sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize);
        cudaMalloc(&outPrePPLDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize);

        //memory for identity matrix
        short2* identityMatrixDevPtr;
        saveMalloc(sizeof(short2) * params->windowSize * params->windowSize);
        cudaMalloc(&identityMatrixDevPtr, sizeof(short2) * params->windowSize * params->windowSize);

        //copying input sequences to texture memory
        TexVariablesAddresses addr = copySeqsToTex(params->seqs, params->startSeqs1No, params->startSeqs2No, params->windowSize);

        //times
        HiResTimer timer;
        int timeNW;//global alignment
        int timeNWreorder;//
        int timeWE = 0;//Watermann Eggert
        int timeWEreorder = 0;
        int timeSM;//Sort and Merge
        int timePrePPLtoPPL;//PrePPL to PPL

/*******************************************************************************
* ALIGNMENT WITH BACKTRACKING
********************************************************************************/

        /***********************************************************************
         * KERNEL 1                                                            *
         * score calculation and "back" matrix filling                         *
         ***********************************************************************/

        dim3 blockShape(params->blockShape,params->blockShape);
        dim3 gridShape((params->windowSize-1)/params->blockShape + 1,(params->windowSize-1)/params->blockShape +1);



        timer.start();
        //rowWidth == maxSeqLength+1 => +1 because we have to take into account the -1 column
        NeedlemanWunschGlobalMatchKernel<<<gridShape,blockShape>>>(AF, back, scoresDevPtr, maxSeqLength+1, (params->windowX == params->windowY));
        cudaThreadSynchronize();
        timer.stop();
        timeNW = (int)timer.getElapsedTime();

        /***********************************************************************
         * KERNEL 2                                                            *
         * backtracing - alignment matches generation                          *
         ***********************************************************************/

        //short2 x;
        //cudaMemcpy(&x, AF, sizeof(short2), cudaMemcpyDeviceToHost);
        //printf("xxx%d %d %d %d\n", x.x, params->seqs->getLengths()[params->windowX*params->windowSize], x.y, params->seqs->getLengths()[params->windowY*params->windowSize]);


        timer.start();
        NeedlemanWunschGlobalBackKernel<<<gridShape,blockShape>>>(back, maxSeqLength+1, prePPLDevPtr, identityMatrixDevPtr, scoresDevPtr, nwcutoff, (params->windowX == params->windowY));
        cudaThreadSynchronize();
        timer.stop();
        timeNW += (int)timer.getElapsedTime();

        //printf("Kernel[%5d] %5dms %5dms\n", params->partId, (int)timer1.getElapsedTime(), (int)timer2.getElapsedTime());

        // reading identity values
        short2* identityMatrixHostPtr = new short2[params->windowSize * params->windowSize];
        cudaMemcpy(identityMatrixHostPtr, identityMatrixDevPtr, sizeof(short2) * params->windowSize * params->windowSize, cudaMemcpyDeviceToHost);
        dm->initDistanceMatrixWindow(params->windowX, params->windowY, params->windowSize, identityMatrixHostPtr);
        delete[] identityMatrixHostPtr;
        cudaFree(identityMatrixDevPtr);

        /***********************************************************************
         * KERNEL 3                                                            *
         * changing order of the results in GPU memory                         *
         ***********************************************************************/

        dim3 blockShape2(params->blockShape, params->blockShape);
        dim3 gridShape2( (params->windowSize*params->windowSize) / params->blockShape);

        /***********************************************************************
         * maxSeqLength*2                                                      *
         *    -> *2 because alignment can be 2x as long as the longest         *
         *      sequence                                                       *
         ***********************************************************************/

        timer.start();
        reorderPrePPLsInMemory<<<gridShape2, blockShape2>>>(prePPLDevPtr, outPrePPLDevPtr, maxSeqLength*2);
        cudaThreadSynchronize();
        timer.stop();
        timeNWreorder = (int)timer.getElapsedTime();

        /***********************************************************************
         * READING RESULTS                                                     *
         ***********************************************************************/

        //size of outPrePPLHostPtr is:
        // (1 + K) * (maxSeqLength * 2 * WIN_SIZE * WIN_SIZE)
        unsigned int* outPrePPLHostPtr = new unsigned int[maxSeqLength * 2 * params->windowSize * params->windowSize * (1+K)];
        cudaMemcpy(outPrePPLHostPtr, outPrePPLDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize, cudaMemcpyDeviceToHost);

//        if((params->windowX == 0) && (params->windowY == 1))
//        {
//            unsigned int* element = outPrePPLHostPtr+(0*params->windowSize+5)*maxSeqLength*2;
//            while((*element) != 0xFFFFFFFF)
//            {
//                printf("x: %3d ", (*element) >> 20);
//                printf("y: %3d ", ((*element) >> 8)&0xFFF);
//                printf("i: %3d\n", (*element)&0xFF);
//                element++;
//            }
//
//        }
        

/*******************************************************************************
* ALIGNMENT K BESTS (WATERMAN-EGGERT)
********************************************************************************/

        //0-elements memory for Waterman-Eggert
        int width = ((maxSeqLength-1) / 4 + 1) * 4; //4->4, 5->8
        //eggertSize == number of bytes for the array
        unsigned int eggertSize = (height * width)/8 * params->windowSize * params->windowSize;
        unsigned int* eggertDevPtr;
        saveMalloc(eggertSize);
        cudaMalloc(&eggertDevPtr, eggertSize);

        //host memory allocation for results


        /***********************************************************************
         * KERNEL 0                                                            *
         * initializing eggert array                                           *
         ***********************************************************************/

        timer.start();
        initEggertArray<<<gridShape,blockShape>>>(eggertDevPtr, eggertSize);
        cudaThreadSynchronize();
        timer.stop();
        timeWE = (int)timer.getElapsedTime();



        //START OF K-times for
        for(int k = 0; k<K; k++)
        {
            /*******************************************************************
             * KERNEL 1                                                        *
             * score calculation and "back" matrix fill                        *
             *******************************************************************/

            timer.start();
            //rowWidth == maxSeqLength+1 => +1 because we have to take into account the -1 column
            WatermanEggertKernel<<<gridShape,blockShape>>>(AF, back, eggertDevPtr, scoresDevPtr, maxSeqLength+1, (params->windowX == params->windowY));
            cudaThreadSynchronize();
            timer.stop();
            timeWE += (int)timer.getElapsedTime();

            /*******************************************************************
             * KERNEL 2                                                        *
             * backtracing - alignment matches generation                      *
             *******************************************************************/

            timer.start();
            WatermanEggertBackKernel<<<gridShape,blockShape>>>(AF, back, eggertDevPtr, scoresDevPtr, maxSeqLength+1, prePPLDevPtr, wecutoff, (params->windowX == params->windowY));
            cudaThreadSynchronize();
            timer.stop();
            timeWE += (int)timer.getElapsedTime();

            //printf("Kernel[%5d] %5dms %5dms\n", params->partId, (int)timer1.getElapsedTime(), (int)timer2.getElapsedTime());


            /*******************************************************************
             * KERNEL 3                                                        *
             * changing order of the results in GPU memory                     *
             *-----------------------------------------------------------------*
             * maxSeqLength*2                                                  *
             *    -> *2 because alignment can be 2x as long as the longest     *
             *      sequence                                                   *
             *******************************************************************/
            timer.start();
            reorderPrePPLsInMemory<<<gridShape2, blockShape2>>>(prePPLDevPtr, outPrePPLDevPtr, maxSeqLength*2);
            cudaThreadSynchronize();
            timer.stop();
            timeWEreorder += (int)timer.getElapsedTime();

            /*******************************************************************
             * READING RESULTS                                                 *
             *******************************************************************/

            unsigned int* currentOut = outPrePPLHostPtr+(maxSeqLength * 2 * params->windowSize * params->windowSize * (k+1));
            cudaMemcpy(currentOut, outPrePPLDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize, cudaMemcpyDeviceToHost);

//            printf("k: %3d \n\n", k);
//
//            if((params->windowX == 0) && (params->windowY == 1))
//            {
//                unsigned int* element = currentOut+(0*params->windowSize+5)*maxSeqLength*2;
//                while((*element) != 0xFFFFFFFF)
//                {
//                    printf("x: %3d ", (*element) >> 20);
//                    printf("y: %3d ", ((*element) >> 8)&0xFFF);
//                    printf("i: %3d\n", (*element)&0xFF);
//                    element++;
//                }
//
//            }


        }//END OF THE K-TIMES LOOP
        
        
        
/*******************************************************************************
* CLEANING UP GPU RAM BEFORE BUILDING PPL                                      *
********************************************************************************/

        //deallocating memory on GPU
        cudaFree(AF);
        cudaFree(back);
        cudaFree(eggertDevPtr);
        cudaFree(prePPLDevPtr);
        cudaFree(outPrePPLDevPtr);
        cudaFree(scoresDevPtr);
        cudaFree(addr.texSeqs1DevPtr);
        cudaFree(addr.texSeqs2DevPtr);

/*******************************************************************************
* MERGING & SORTING RESULTS (BUILDING PPL)                                     *
********************************************************************************/

        //If (params->windowX == params->windowY) then
        //we build only one PPL.
        //Otherwise we build two PPLs.

          
        //memory for K+1 prePPLs (alignments) before merging and sorting
        unsigned int* prePPLsDevPtr;
        saveMalloc(sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (K+1));
        cudaMalloc(&prePPLsDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (K+1)); 
        cudaMemcpy(prePPLsDevPtr, outPrePPLHostPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (K+1), cudaMemcpyHostToDevice );

        //memory for K+1 prePPLs (alignments) after merging and sorting
        unsigned int* outPrePPLsDevPtr;
        saveMalloc(sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (K+1));
        cudaMalloc(&outPrePPLsDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (K+1));

        //memory for array with merged alignments (PL entries) lengths
        unsigned int* lengthsDevPtr;
        unsigned int* lengthsHostPtr = new unsigned int[params->windowSize * params->windowSize];
        saveMalloc(sizeof(int) * (params->windowSize * params->windowSize + 1));
        cudaMalloc(&lengthsDevPtr, sizeof(int) * (params->windowSize * params->windowSize + 1)); //+1 because it will be used then to store starts
        


       /************************************************************************
        * MERGING AND SORTING vol.1 (BELOW DIAGONAL)                           *
        ***********************************************************************/
        gridShape.y = params->windowSize;
        timer.start();
        // Sorted by Y coordinate (i.e. by second sequence).
        // For example:
        // if we have alignment of sequences (1,2)
        // then it is sorted by second index (here by 2 sequence).
        // Alignments are always sorted by second index
        // regardless of their position (above or below the diagonal).
        sortAndMergePrePPL<<<gridShape, blockShape>>>(prePPLsDevPtr, outPrePPLsDevPtr, maxSeqLength * 2, this->K, params->windowX, params->windowY, lengthsDevPtr, seqs->getSequenceNumber());
        cudaThreadSynchronize();
        timer.stop();
        timeSM = (int)timer.getElapsedTime();
      
        //cudaMemcpy(outPrePPLHostPtr, outPrePPLsDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (1+K), cudaMemcpyDeviceToHost);
        cudaMemcpy(lengthsHostPtr, lengthsDevPtr, sizeof(int) * params->windowSize * params->windowSize, cudaMemcpyDeviceToHost);

        //PPL
        params->pplX = new PartialPrimaryLibrary();
        params->pplX->allocateStarts(params->windowSize);
        params->pplX->starts[0] = 0;
        for(int i=0; i<(params->windowSize*params->windowSize); i++)
            params->pplX->starts[i+1] = params->pplX->starts[i] + lengthsHostPtr[i];

        //copying starts to GPU
        cudaMemcpy(lengthsDevPtr, params->pplX->starts, sizeof(int) * (params->windowSize * params->windowSize + 1), cudaMemcpyHostToDevice);

        unsigned int PPLelemCount = params->pplX->starts[params->windowSize*params->windowSize];
        params->pplX->allocatePPL(PPLelemCount);

        unsigned int* outPPLDevPtr;
        saveMalloc(sizeof(int) * PPLelemCount);
        cudaMalloc(&outPPLDevPtr, sizeof(int) * PPLelemCount);

        timer.start();
        PrePPLtoPPL<<<gridShape, blockShape>>>(outPrePPLsDevPtr, outPPLDevPtr, lengthsDevPtr, maxSeqLength * 2, this->K);
        cudaThreadSynchronize();
        timer.stop();
        timePrePPLtoPPL = (int)timer.getElapsedTime();
        cudaMemcpy(params->pplX->ppl, outPPLDevPtr, sizeof(int) * PPLelemCount, cudaMemcpyDeviceToHost);
        //printf("%fMB\t%fMB\n", (sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (K+1))/(1024.0*1024), (sizeof(int) * PPLelemCount)/(1024.0*1024));

//        // alignment printing
//        for(int i=params->pplX->starts[params->windowSize]; i<params->pplX->starts[params->windowSize+1]; i++)
//        {
//            printf("x: %3d ", params->pplX->ppl[i] >> 20);
//            printf("y: %3d ", (params->pplX->ppl[i] >> 8)&0xFFF);
//            printf("i: %3d\n", params->pplX->ppl[i]&0xFF);
//        }
        
//        //TMP:
//        printf("--------------\n\n");
//        if((params->windowX == 0) && (params->windowY == 1))
//        {
//            int ind = 0;
//            unsigned int* element = outPrePPLHostPtr+(0*params->windowSize+5)*maxSeqLength*2*(K+1);
//            printf("Length: %d\n", lengthsHostPtr[0*params->windowSize+5]);
//            //unsigned int* element = outPrePPLHostPtr+2*maxSeqLength*2*(K+1);
//            while(((*element) != 0xFFFFFFFF))
//            {
//                printf("x: %3d ", (*element) >> 20);
//                printf("y: %3d ", ((*element) >> 8)&0xFFF);
//                printf("i: %3d\n", (*element)&0xFF);
//                element++;
//                ind++;
//            }
//
//        }


       /************************************************************************
        * MERGING AND SORTING vol.2 (ABOVE DIAGONAL)                           *
        ***********************************************************************/
        if((params->windowX != params->windowY) && (this->entireMatrix))
        {
            timer.start();
            // Sorted by Y coordinate (i.e. by second sequence).
            // For example:
            // if we have alignment of sequences (1,2)
            // then it is sorted by second index (here by 2 sequence).
            // Alignments are always sorted by the second index
            // regardless of their position (above or below the diagonal).
            sortAndMergePrePPL<<<gridShape, blockShape>>>(prePPLsDevPtr, outPrePPLsDevPtr, maxSeqLength * 2, this->K, params->windowY, params->windowX, lengthsDevPtr, seqs->getSequenceNumber());
            cudaThreadSynchronize();
            timer.stop();
            timeSM += (int)timer.getElapsedTime();

            //cudaMemcpy(outPrePPLHostPtr, outPrePPLsDevPtr, sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (1+K), cudaMemcpyDeviceToHost);
            cudaMemcpy(lengthsHostPtr, lengthsDevPtr, sizeof(int) * params->windowSize * params->windowSize, cudaMemcpyDeviceToHost);


            //PPL
            params->pplY = new PartialPrimaryLibrary();
            params->pplY->allocateStarts(params->windowSize);
            params->pplY->starts[0] = 0;
            for(int i=0; i<(params->windowSize*params->windowSize); i++)
                params->pplY->starts[i+1] = params->pplY->starts[i] + lengthsHostPtr[i];

            //copying starts to GPU
            cudaMemcpy(lengthsDevPtr, params->pplY->starts, sizeof(int) * (params->windowSize * params->windowSize + 1), cudaMemcpyHostToDevice);

            unsigned int PPLelemCount2 = params->pplY->starts[params->windowSize*params->windowSize];
            params->pplY->allocatePPL(PPLelemCount2);

            timer.start();
            PrePPLtoPPL<<<gridShape, blockShape>>>(outPrePPLsDevPtr, outPPLDevPtr, lengthsDevPtr, maxSeqLength * 2, this->K);
            cudaThreadSynchronize();
            timer.stop();
            timePrePPLtoPPL += (int)timer.getElapsedTime();
            cudaMemcpy(params->pplY->ppl, outPPLDevPtr, sizeof(int) * PPLelemCount2, cudaMemcpyDeviceToHost);
            //printf("%fMB\t%fMB\n", (sizeof(int) * maxSeqLength * 2 * params->windowSize * params->windowSize * (K+1))/(1024.0*1024), (sizeof(int) * PPLelemCount)/(1024.0*1024));

//            // alignment printing
//            if((params->windowX == 0) && (params->windowY == 1))
//            {
//                for(int i=params->pplY->starts[4]; i<params->pplY->starts[5]; i++)
//                {
//                    printf("x: %3d ", params->pplY->ppl[i] >> 20);
//                    printf("y: %3d ", (params->pplY->ppl[i] >> 8)&0xFFF);
//                    printf("i: %3d\n", params->pplY->ppl[i]&0xFF);
//                }
//            }

            // The code below is for testing purposes only and
            // may eventually be removed.
            bool done = false;
            //if ((params->windowX == 0) && (params->windowY == 1))
            for(int i=0; i<params->windowSize; i++)
            {
                for(int j=0; j<params->windowSize; j++)
                {
                    int length1 = params->pplX->starts[i*params->windowSize + j + 1] - params->pplX->starts[i*params->windowSize + j];
                    int length2 = params->pplY->starts[j*params->windowSize + i + 1] - params->pplY->starts[j*params->windowSize + i];
                    if(length1 != length2)
                    {
                        printf("x=%5d y=%5d vX=%5d vY=%5d\n", i, j, length1, length2);

//                        for(int k=params->pplX->starts[i*params->windowSize + j]; k<params->pplX->starts[i*params->windowSize + j + 1]; k++)
//                        {
//                            printf("x: %3d ", params->pplX->ppl[k] >> 20);
//                            printf("y: %3d ", (params->pplX->ppl[k] >> 8)&0xFFF);
//                            printf("i: %3d\n", params->pplX->ppl[k]&0xFF);
//                        }
//                        printf("================================================================\n");
//                        for(int k=params->pplY->starts[j*params->windowSize + i]; k<params->pplY->starts[j*params->windowSize + i + 1]; k++)
//                        {
//                            printf("x: %3d ", params->pplY->ppl[k] >> 20);
//                            printf("y: %3d ", (params->pplY->ppl[k] >> 8)&0xFFF);
//                            printf("i: %3d\n", params->pplY->ppl[k]&0xFF);
//                        }

                        done = true;
                        break;
                    }
                }
                if(done)
                    break;

            }
        }
        else if((params->windowX != params->windowY) && (!this->entireMatrix))
        {
            // If entireMatrix is set to false then we don't
            // compute elements above diagonal.

            //PPL
            params->pplY = new PartialPrimaryLibrary();
            params->pplY->allocateStarts(params->windowSize);
            for(int i=0; i<=(params->windowSize*params->windowSize); i++)
                params->pplY->starts[i] = 0;
        }

        printf("Kernel[%5d] %5dms %5dms %5dms %5dms %5dms %5dms = %5dms\n",
               params->partId,
               timeNW,
               timeNWreorder,
               timeWE,
               timeWEreorder,
               timeSM,
               timePrePPLtoPPL,
               timeNW + timeNWreorder + timeWE + timeWEreorder + timeSM + timePrePPLtoPPL);


        cudaFree(prePPLsDevPtr);
        cudaFree(outPrePPLsDevPtr);
        cudaFree(lengthsDevPtr);
        cudaFree(outPPLDevPtr);

        delete[] lengthsHostPtr;
                

/*******************************************************************************
* FINISHING                                                                    *
********************************************************************************/

        checkError("errors in PPBuilder");

        // deallocate memory on the CPU
        delete[] outPrePPLHostPtr;

    }
    catch (Exception* ex)
    {
        printf("%s\n",ex->getMessage());
    }

}
