#include "extended_library_builder_gpu.cuh"
#include "extended_library.h"

using std::string;
using namespace Exceptions;
using namespace Data;
using Data::Sequences;
using Data::MatchesManager;
using Exceptions::Exception;



/*******************************************************************************
 * Creates the builder together with the primary library builder (plb) that    *
 * the other methods of this class delegate to.                                *
 *******************************************************************************/

ExtendedLibraryBuilder::ExtendedLibraryBuilder()
{
    this->plb = new PrimaryLibraryBuilder();
}


/*******************************************************************************
 * Reads the settings of this algorithm from the arguments manager.            *
 *                                                                             *
 * The primary library builder loads its own arguments first and is then       *
 * forced to compute the entire matrix. Afterwards the optional output file    *
 * ("eloutfile" / "elo") and the window height ("extlibwin" / "elw") are read. *
 *******************************************************************************/

void ExtendedLibraryBuilder::actuallyArgumentsLoading(ArgumentsManager* am)
{
    plb->actuallyArgumentsLoading(am);

    // we have to compute entire matrix here
    plb->entireMatrix = true;

    // The output file is optional: NULL means "do not save the library"
    // (run() checks for NULL before saving).
    this->eloutfile = NULL;
    if (am->containParam("eloutfile", "elo"))
    {
        this->eloutfile = am->getParam("eloutfile", "elo");
    }

    // Number of sequences (rows) covered by a single window.
    this->winHeight = am->getIntParam("extlibwin", "elw");
}


/*******************************************************************************
 * Returns path to the file with default settings for the algorithm            *
 *                                                                             *
 * The returned pointer refers to a string literal, so the caller must treat   *
 * it as read-only and must not free it. The explicit const_cast replaces the  *
 * implicit literal-to-char* conversion, which is deprecated in C++03 and      *
 * ill-formed since C++11, while keeping the declared return type unchanged.   *
 *******************************************************************************/

char* ExtendedLibraryBuilder::defaultSettingsFile()
{
    return const_cast<char*>("etc/elb-defaults.cnf");
}


/*******************************************************************************
 * Builds the extended library.                                                *
 *                                                                             *
 * First the primary library builder is run with an ExtendedLibrary object as  *
 * its target. Then one job per window of winHeight sequences is handed to the *
 * thread manager; each job ends up in actualInvokedMethod(). When all jobs    *
 * are finished the library is optionally written to eloutfile.                *
 *******************************************************************************/

void ExtendedLibraryBuilder::run(ThreadManager* tm)
{
    // The primary builder fills an ExtendedLibrary instance, so the extended
    // weights can be allocated on the very same object afterwards.
    plb->library = new ExtendedLibrary();
    plb->run(tm);
    ExtendedLibrary* extLib = (ExtendedLibrary*) plb->library;
    extLib->allocateEL();

    //actual extended library building
    //HERE TASKS ARE DEFINED

    HiResTimer timer;
    timer.start();

    sequenceNumber = this->plb->seqs->getSequenceNumber();
    // Rounded-up quotient sequenceNumber / winHeight (for sequenceNumber >= 1).
    windowsNumber = (sequenceNumber - 1) / winHeight + 1;

    // One parameter object per window; part ids are 1-based.
    ELWindowParams** jobs = new ELWindowParams*[windowsNumber];
    for (int w = 0; w < windowsNumber; ++w)
    {
        ELWindowParams* job = new ELWindowParams();
        job->windowY = w;
        job->partId = w + 1;
        jobs[w] = job;
    }

    // Long jobs first and short jobs last is better for load balancing.
    // A window with a higher windowY iterates over more reinforcing windows
    // in actualInvokedMethod(), so the jobs are requested in reverse order
    // (shortest jobs last).
    for (int w = windowsNumber; w-- > 0; )
    {
        // NOTE(review): this object is not released here - presumably the
        // invoker / thread manager takes ownership; confirm.
        RunnableWithParams* invocation = new RunnableWithParams();
        invocation->algorithm = this;
        invocation->params = jobs[w];
        tm->request(invoker, (void*)invocation, -1);
    }

    // Block until every requested job has been processed.
    tm->wait();
    timer.stop();
    printf("%s total: %dms\n", getAlgorithmName(), (int) timer.getElapsedTime());

    // The window parameters are no longer needed.
    for (int w = 0; w < windowsNumber; ++w)
    {
        delete jobs[w];
    }
    delete[] jobs;

    // Saving extended library to a file using T-Coffee file format.
    if (this->eloutfile == NULL)
    {
        return;
    }

    timer.start();

    extLib->saveLibraryToFile(this->eloutfile, extLib, this->plb->seqs, this->plb->K);

    timer.stop();
    printf("Saving library to file %s: %dms\n", this->eloutfile, (int)timer.getElapsedTime());
}


/*******************************************************************************
 * Returns the short name of the algorithm; run() uses it as the prefix of     *
 * its total-time message.                                                     *
 *******************************************************************************/

const char* ExtendedLibraryBuilder::getAlgorithmName()
{
    static const char* const algorithmName = "elb";
    return algorithmName;
}


void ExtendedLibraryBuilder::actualInvokedMethod(void* voidParams)
{
    //THIS FUNCTION IS CALLED BY THREAD MANAGER

    ELWindowParams* params = (ELWindowParams*)voidParams;

    try
    {
        int winElements = sequenceNumber * winHeight; // sequenceNumber is "window length"

        //printf("%d   %d\n", params->windowY, this->windowsNumber);

        // computation of max elements count in a REINFORCING WINDOW
        unsigned int maxElementsCount = 0;
        for(int i=0; i<=params->windowY; i++)
        {
            int maxStartsIndex = MIN((i+1)*winElements, sequenceNumber*sequenceNumber);
            int elementsCount = plb->library->starts[maxStartsIndex] -
                                plb->library->starts[i*winElements];
            maxElementsCount = MAX(maxElementsCount, elementsCount);
        }


        // memory for 1st window (BASE WINDOW)
        int baseWinMaxStartsIndex = MIN((params->windowY+1)*winElements, sequenceNumber*sequenceNumber);
        int baseWinElemCount = plb->library->starts[baseWinMaxStartsIndex] -
                               plb->library->starts[params->windowY*winElements];
        unsigned int* baseWinDevPtr;
        saveMalloc(sizeof(int) * baseWinElemCount);
        cudaMalloc(&baseWinDevPtr, sizeof(int) * baseWinElemCount);

        // memory for 2nd window (REINFORCING WINDOW)
        unsigned int* reinforceWinDevPtr;
        saveMalloc(sizeof(int) * maxElementsCount);
        cudaMalloc(&reinforceWinDevPtr, sizeof(int) * maxElementsCount);

        // memory for results - extended library weights
        unsigned int* resultsDevPtr;
        saveMalloc(sizeof(int) * baseWinElemCount);
        cudaMalloc(&resultsDevPtr, sizeof(int) * baseWinElemCount);

        // memory for "starts" array
        unsigned int* startsDevPtr;
        saveMalloc(sizeof(int) * sizeof(int)*(sequenceNumber*sequenceNumber+1));
        cudaMalloc(&startsDevPtr, sizeof(int)*(sequenceNumber*sequenceNumber+1));
        cudaMemcpy(startsDevPtr, this->plb->library->starts, sizeof(int)*(sequenceNumber*sequenceNumber+1), cudaMemcpyHostToDevice);



        // copying data to BASE WINDOW
        unsigned int* myBaseWinHostPtr = &(this->plb->library->pl[ plb->library->starts[params->windowY*winElements] ]);
        cudaMemcpy(baseWinDevPtr, myBaseWinHostPtr, sizeof(int)*baseWinElemCount, cudaMemcpyHostToDevice);

//        // initializing "resultsDevPtr"
//        dim3 initBlockShape(256); // 256 x 1 x 1
//        dim3 initGridShape((baseWinElemCount-1)/initBlockShape.x +1);
//        initExtendedLibrary<<<initGridShape, initBlockShape>>>(baseWinDevPtr, resultsDevPtr, baseWinElemCount);

//        //TEST
//        unsigned int* resultsHostPtr = new unsigned int[baseWinElemCount];
//        cudaMemcpy(resultsHostPtr, resultsDevPtr, sizeof(int)*baseWinElemCount, cudaMemcpyDeviceToHost);
//        for(int i=0; i<baseWinElemCount; i++)
//            if(resultsHostPtr[i] != (myBaseWinHostPtr[i]&0xFF))
//                printf("lol\n");

        // GRID & BLOCK shape
        dim3 blockShape(256); // 256 x 1 x 1
        //int currWinWidth = sequenceNumber-(params->windowY*winHeight);
        // ecause when having a particular REIN. WINDOW
        // we can compute only those cells which correspond
        // to this window.
        dim3 gridShape(winHeight, winHeight);

        HiResTimer timer;
        int time;

        // To compute a particular window we have to have
        // all windows that are above our window
        // (this referrs to our case as we
        // compute only elements below diagonal).
        
        //for(int i=params->windowY; i<this->windowsNumber; i++) // above diagonal
        for (int i = 0; i <= params->windowY; i++) // below diagonal
        {
            // copying data to REINFORCING WINDOW
            int reinWinMaxStartsIndex = MIN((i+1)*winElements, sequenceNumber*sequenceNumber);
            int reinWinElemCount = plb->library->starts[reinWinMaxStartsIndex] -
                                   plb->library->starts[i*winElements];
            unsigned int* myReinWinHostPtr = &(this->plb->library->pl[ plb->library->starts[i*winElements] ]);
            cudaMemcpy(reinforceWinDevPtr, myReinWinHostPtr, sizeof(int)*reinWinElemCount, cudaMemcpyHostToDevice);

            timer.start();
            
            extLibBuilder<<<gridShape, blockShape>>>( baseWinDevPtr,
                                                      reinforceWinDevPtr,
                                                      startsDevPtr,
                                                      resultsDevPtr,
                                                      (unsigned short)sequenceNumber,
                                                      (unsigned short)winHeight,
                                                      (unsigned short)i, // "window X"
                                                      (unsigned short)params->windowY);
            cudaThreadSynchronize();
            timer.stop();
            time = timer.getElapsedTime();
            checkError("errors in ELBuilder");
            printf("Kernel[%5d,%5d] %5dms\n", params->windowY, i, time);
            
        }

        
//        // test of initialization "resultsDevPtr" array
//        unsigned int* resultsHostPtr = new unsigned int[baseWinElemCount];
//        cudaMemcpy(resultsHostPtr, resultsDevPtr, sizeof(int)*baseWinElemCount, cudaMemcpyDeviceToHost);
//        for(int j=params->windowY*winHeight; j<(params->windowY + 1)*winHeight; j++)// y
//            for(int i=0; i<sequenceNumber; i++)// x
//                if ( (i > j) && (j<sequenceNumber) )
//                    for (int k = plb->pl->starts[j*sequenceNumber + i]; k < plb->pl->starts[j*sequenceNumber + i + 1]; k++)
//                        if(resultsHostPtr[k - plb->pl->starts[params->windowY*winElements]] !=  (myBaseWinHostPtr[k - plb->pl->starts[params->windowY*winElements]]&0xFF))
//                        {
//                            printf("%d   %d", i, j);
//                            //printf("%d   %d\n", resultsHostPtr[k - plb->pl->starts[params->windowY*winElements]],  (myBaseWinHostPtr[k - plb->pl->starts[params->windowY*winElements]]&0xFF));
//                            break;
//                        }


        ExtendedLibrary* extLib = (ExtendedLibrary*) plb->library;

//        printf("%d   %d    %d\n", plb->library->starts[params->windowY*winElements],
//                                  plb->library->totalNumOfElements,
//                                  baseWinElemCount);
        unsigned int* resultsHostPtr = &(extLib->el[ plb->library->starts[params->windowY*winElements] ]);
        //unsigned int* resultsHostPtr = new unsigned int[baseWinElemCount];
        cudaMemcpy(resultsHostPtr, resultsDevPtr, sizeof(int)*baseWinElemCount, cudaMemcpyDeviceToHost);


//        int xx = params->windowY*winHeight + 2;
//        int yy = params->windowY*winHeight + 0;
//
//        for (int k = plb->library->starts[yy*sequenceNumber + xx]; k < plb->library->starts[yy*sequenceNumber + xx + 1]; k++)
//            if((myBaseWinHostPtr[k - plb->library->starts[params->windowY*winElements]]&0xFF) != resultsHostPtr[k - plb->library->starts[params->windowY*winElements]])
//                printf("pl=%d    ext=%d\n", myBaseWinHostPtr[k - plb->library->starts[params->windowY*winElements]]&0xFF, resultsHostPtr[k - plb->library->starts[params->windowY*winElements]]);

        //delete[] resultsHostPtr;


        // memory dealocation
        cudaFree(reinforceWinDevPtr);
        cudaFree(baseWinDevPtr);
        cudaFree(resultsDevPtr);
        cudaFree(startsDevPtr);

    }
    catch (Exception* ex)
    {
        printf("%s\n",ex->getMessage());
    }

}
