/*
 * nws_algorithm.cpp
 *
 *  Created on: Oct 15, 2011
 *      Author: michal
 */

#include <limits.h>
#include <cuda_runtime.h>
#include <math.h>

#ifdef MPI_ASM_ALIGN
#include <mpi.h>
#endif /* MPI_ASM_ALIGN */

#include "constants.h"
#include "nws_algorithm_option.h"
#include "exception.h"
#include "thread_manager.h"
#include "nws_single_gpu_runnable_init.h"
#include "nws_single_gpu_runnable.h"
#include "nws_single_gpu_runnable_finalization.h"

using std::vector;
using std::string;
using std::map;

using align::NWSAlgorithmOption;
using align::NWSSingleGPURunnableInit;
using align::NWSSingleGPURunnable;
using align::NWSSingleGPURunnableFinalization;
using core::Exception;


// Rank of this process among the cooperating MPI processes. The default of 0
// corresponds to a single-process run.
int NWSAlgorithmOption::mpi_node_id = 0;
// Number of cooperating processes. It must never be 0: the chunking loops in
// this file advance by pairsPerGPU * mpi_node_count, so a zero count would
// make them spin forever. Default to a single process; code that assigns the
// real value (e.g. after MPI initialisation) simply overwrites it.
int NWSAlgorithmOption::mpi_node_count = 1;

/**
 * Returns the list of command names held by the base Option, registering
 * "nws" the first time the list is found empty.
 *
 * @return reference to the command list obtained from Option::getCommands().
 */
vector<string> &NWSAlgorithmOption::getCommands(void) {
	vector<string> &commands = Option::getCommands();

	// Register the command name only once; later calls find the list populated.
	if (commands.empty())
		commands.push_back("nws");

	return commands;
}


/**
 * Runs the "nws" command end to end.
 *
 * Steps: read the settings from `params`, pack and sort the input, (MPI builds
 * only) broadcast pairs and sequences to all processes, count the cells this
 * process is responsible for, run the alignment on the CPU ("gold") or on the
 * GPUs (optionally verified against the CPU), (MPI builds only) collect the
 * per-process timings on rank 0 and print a summary, and finally save the
 * results if a results file name was supplied.
 *
 * Work is partitioned in chunks of `pairsPerGPU` pairs; process `mpi_node_id`
 * owns every `mpi_node_count`-th chunk. Both statics are only read here, so
 * they have to be set before this function is called.
 *
 * @param params option values keyed by name. Pointers are stored directly;
 *               integers and booleans are stored in the pointer value itself
 *               (hence the casts below). Keys read: "seq", "sm", "pairs",
 *               "gold", "gp", "gpus", "gpusNumber", "verify-results",
 *               "pairsPerGpu", "results".
 */
void NWSAlgorithmOption::executeOption(map<string, void *> &params) {

    seqs = (Sequences*)params["seq"];
    sm = (SubstitutionMatrix*)params["sm"];
    p = (Pairs*)params["pairs"];
    gold = (bool)params["gold"];
    gapPenalty = (int)(long long)params["gp"];
    desiredGpus = (int *)params["gpus"];
    desiredGpusNumber = (int)(long long)params["gpusNumber"];
    verify = (bool)params["verify-results"];
    pairsPerGPU = (int)(long long)params["pairsPerGpu"];
    resultsFileName = (string *)params["results"];

    // A missing (0) or nonsensical (negative) chunk size falls back to the
    // default; a negative value would otherwise corrupt the unsigned chunk
    // loops below.
    if (pairsPerGPU <= 0)
        pairsPerGPU = 16384;

    double computationTime = 0;
    double communicationTime = 0;

    seqs->packSequences(sm);
    seqs->testPacking();
    p->sort(seqs);

#ifdef MPI_ASM_ALIGN
    // Time the broadcast between two barriers so that every process measures
    // the same interval.
    HiResTimer timer;
    MPI_Barrier(MPI_COMM_WORLD);
    timer.start();

    p->bcast();
    seqs->bcast(sm);

    MPI_Barrier(MPI_COMM_WORLD);
    timer.stop();
    communicationTime = timer.getElapsedTime()/1000.0;
#endif

    results = new Results(p->pairsCount);

    // Count the matrix cells of the chunks owned by this process. The product
    // is widened before multiplying so two long sequences cannot overflow.
    cells = 0;
    for (unsigned int i = pairsPerGPU * mpi_node_id;
            i < p->pairsCount;
            i += pairsPerGPU * mpi_node_count)
    {
        for (unsigned int j = i; (j < i + pairsPerGPU) && (j < p->pairsCount); j++)
            cells += (unsigned long long)seqs->lengths[ p->pair1[j] ] * seqs->lengths[ p->pair2[j] ];
    }

    if (gold)
    {
        // Reference run: every pair is computed on the CPU and only the
        // elapsed time is printed.
        HiResTimer timerCPU;
        timerCPU.start();
        runAllOnCPU();
        timerCPU.stop();
        printf("%.2fms\n", timerCPU.getElapsedTime()/1000.0);
    }
    else
    {
        computationTime = runAllOnGPU();
        if (verify)
            compareAllGPUtoCPU();
    }

#ifdef MPI_ASM_ALIGN
    MPI_Barrier(MPI_COMM_WORLD);

    // Message tag shared by the timing send and receive below.
    const int timingTag = 42;

    if (NWSAlgorithmOption::mpi_node_id == 0)
    {
        unsigned long long totalCells = 0;
        for (unsigned int i = 0; i < p->pairsCount; i++)
        {
            totalCells += (unsigned long long)seqs->lengths[ p->pair1[i] ] * seqs->lengths[ p->pair2[i] ];
        }

        // Only the slowest process matters, so keep a running maximum instead
        // of storing every timing. The maximum starts at 0, then takes this
        // process' own time and each received one into account.
        double maxTotalTime = 0;
        const double ownTotalTime = communicationTime + computationTime;
        if (ownTotalTime > maxTotalTime)
            maxTotalTime = ownTotalTime;

        for (int i = 1; i < NWSAlgorithmOption::mpi_node_count; i++)
        {
            double nodeTime = 0;
            MPI_Status status;
            MPI_Recv(&nodeTime, 1, MPI_DOUBLE, MPI_ANY_SOURCE, timingTag, MPI_COMM_WORLD, &status);
            if (nodeTime > maxTotalTime)
                maxTotalTime = nodeTime;
        }

        printf("TOTAL GCUPS=%.2f\n", ((double)totalCells / (double)maxTotalTime)/1000000.0 );
        printf("TOTAL time=[%.2fms]\n", maxTotalTime);
        printf("Approx. communication time=[%.2fms]\n", communicationTime);
        printf("Approx. computation time=[%.2fms]\n", maxTotalTime - communicationTime);
        printf("Cells=[%llu]\n", totalCells);
    }
    else
    {
        double myTotalTime = communicationTime + computationTime;
        MPI_Send(&myTotalTime, 1, MPI_DOUBLE, 0, timingTag, MPI_COMM_WORLD);
    }
#endif

    if (resultsFileName)
        results->save(*resultsFileName, p, pairsPerGPU, mpi_node_count, mpi_node_id);
    delete results;
}

/**
 * Computes every pair on the CPU and stores each outcome in
 * results->scores_overlaps at the pair's index.
 *
 * All pairs from 0 to p->pairsCount are processed; no per-process
 * partitioning by mpi_node_id is applied here.
 */
void NWSAlgorithmOption::runAllOnCPU()
{
	int pairID = 0;
	while (pairID < p->pairsCount)
	{
		results->scores_overlaps[pairID] = runSingleNWSOnCPUs(pairID);
		++pairID;
	}
}

/**
 * CPU reference implementation for a single pair of sequences.
 *
 * Fills the dynamic-programming matrix one row at a time (rows follow
 * sequence 2, columns follow sequence 1), keeping only the current row in H.
 * Each cell is the best of: left neighbour minus the gap penalty, upper
 * neighbour minus the gap penalty, and upper-left neighbour plus the
 * substitution score. The first row and first column start from 0.
 *
 * The best score is searched in the last column of every row and afterwards
 * in the last row (excluding its final cell, already covered by the column
 * scan).
 *
 * @param pairID index into p->pair1 / p->pair2.
 * @return .x = best score found (never below 0);
 *         .y = length2 - 1 - row when the best score sits in the last column,
 *              or -(length1 - 1 - column) when it sits in the last row;
 *              0 when no cell was examined or none matched the initial score.
 */
int2 NWSAlgorithmOption::runSingleNWSOnCPUs(int pairID)
{
	int seq1ID = p->pair1[pairID];
	int seq2ID = p->pair2[pairID];
	int length1 = seqs->lengths[seq1ID];
	int length2 = seqs->lengths[seq2ID];
	int start1 = seqs->starts[seq1ID];
	int start2 = seqs->starts[seq2ID];
	unsigned int *packedSeq1 = &seqs->packedSeqs[start1];
	unsigned int *packedSeq2 = &seqs->packedSeqs[start2];

	int2 scores_overlaps;
	scores_overlaps.x = 0;
	// Give the overlap a defined value: it used to stay uninitialised whenever
	// no examined cell equalled the running maximum.
	scores_overlaps.y = 0;

	// Without columns there is no last column to inspect; indexing
	// H[length1-1] would be out of bounds.
	if (length1 <= 0)
		return scores_overlaps;

	// Current matrix row, zero-initialised; released automatically even if a
	// callee throws.
	vector<short> H(length1, 0);

	for (int j = 0; j < length2; ++j)
	{
		// UNPACKING RESIDUE FROM (VERTICAL) SEQ2
		unsigned int res2 = packedSeq2[j/seqs->residuesPer32bit];
		res2 >>= (j%seqs->residuesPer32bit) * seqs->bits;
		res2 &= seqs->mask;

		short H_upleft = 0;
		short H_left = 0;

		for (int i = 0; i < length1; ++i)
		{
			// UNPACKING RESIDUE FROM (HORIZONTAL) SEQ1
			unsigned int res1 = packedSeq1[i/seqs->residuesPer32bit];
			res1 >>= (i%seqs->residuesPer32bit) * seqs->bits;
			res1 &= seqs->mask;

			H_left = MAX(H_left - gapPenalty, H_upleft + sm->getNotPacked(res1, res2));
			H_left = MAX(H[i]   - gapPenalty, H_left);

			// H[i] still holds the previous row here: remember it as the
			// diagonal neighbour of the next column before overwriting it.
			H_upleft = H[i];

			H[i] = H_left;
		}

		// Last column of this row; on ties the later row wins.
		scores_overlaps.x = MAX(scores_overlaps.x, H[length1-1]);
		if(scores_overlaps.x == H[length1-1])
			scores_overlaps.y = length2 - 1 - j;
	}

	// Last row; a tie with the best score found so far moves the overlap here.
	for (int i = 0; i < length1-1; ++i)
	{
		scores_overlaps.x = MAX(scores_overlaps.x, H[i]);
		if(scores_overlaps.x == H[i])
			scores_overlaps.y = -(length1 - 1 - i);
	}

	return scores_overlaps;
}

/**
 * Checks the stored GPU result of every pair owned by this process against
 * the CPU computation.
 *
 * The pairs are visited chunk by chunk: chunks are pairsPerGPU pairs long,
 * the first one starts at pairsPerGPU * mpi_node_id and consecutive ones are
 * pairsPerGPU * mpi_node_count apart.
 *
 * On the first mismatch the offending pair is printed and the value returned
 * by Exception::setMessage() on a heap-allocated Exception is thrown.
 */
void NWSAlgorithmOption::compareAllGPUtoCPU()
{
    unsigned int chunkStart = pairsPerGPU * mpi_node_id;
    while (chunkStart < p->pairsCount)
    {
        for (unsigned int pairID = chunkStart;
                (pairID < chunkStart + pairsPerGPU) && (pairID < p->pairsCount);
                pairID++)
        {
            if (compareSingleGPUtoCPU(pairID))
                continue;   // results agree, move on to the next pair

            printf("ERR: pair[%d]=[%d, %d], GPU=[%d, %d]\n", pairID, p->pair1[pairID], p->pair2[pairID], results->scores_overlaps[pairID].x, results->scores_overlaps[pairID].y);
            throw (new Exception())->setMessage("ERROR: GPU and CPU results are different!");
        }

        chunkStart += pairsPerGPU * mpi_node_count;
    }
}

bool NWSAlgorithmOption::compareSingleGPUtoCPU(int pairID)
{
    int seq1ID = p->pair1[pairID];
    int seq2ID = p->pair2[pairID];
    int length1 = seqs->lengths[seq1ID];
    int length2 = seqs->lengths[seq2ID];
    int start1 = seqs->starts[seq1ID];
    int start2 = seqs->starts[seq2ID];
    unsigned int *packedSeq1 = &seqs->packedSeqs[start1];
    unsigned int *packedSeq2 = &seqs->packedSeqs[start2];
    
    short *H = new short[length1];
    for(int i=0; i<length1; i++)
        H[i] = 0;

    short GPUscore   = results->scores_overlaps[pairID].x;
    short GPUoverlap = results->scores_overlaps[pairID].y;
    short CPUscore = 0;
    
    for (int j = 0; j < length2; ++j)
    {
        // UNPACKING RESIDUE FROM (VERTIVAL) SEQ2
        unsigned int res2 = packedSeq2[j/seqs->residuesPer32bit];
        res2 >>= (j%seqs->residuesPer32bit) * seqs->bits;
        res2 &= seqs->mask;

        short H_upleft = 0;
        short H_left = 0;
        
        for (int i = 0; i < length1; ++i)
        {
            // UNPACKING RESIDUE FROM (HORIZONTAL) SEQ1
            unsigned int res1 = packedSeq1[i/seqs->residuesPer32bit];
            res1 >>= (i%seqs->residuesPer32bit) * seqs->bits;
            res1 &= seqs->mask;
            
            H_left = MAX(H_left - gapPenalty, H_upleft + sm->getNotPacked(res1, res2));
            H_left = MAX(H[i]   - gapPenalty, H_left);
                        
            H_upleft = H[i];
            
            H[i] = H_left;
        }
        CPUscore = MAX(CPUscore, H[length1-1]);
        if(length2 - 1 - j == GPUoverlap)
            if(H[length1-1] != GPUscore)
            {
//                delete[] H;
//                return false;
            }
        
    }
    
    for (int i = 0; i < length1-1; ++i)
    {
        CPUscore = MAX(CPUscore, H[i]);
        if(-(length1 - 1 - i) == GPUoverlap)
            if(H[i] != GPUscore)
            {
//                delete[] H;
//                return false;
            }
                
    }
    
    
    delete[] H;
    
    if(GPUscore != CPUscore)
    {
        printf("CPU SCORE=%d  GPU SCORE=%d\n", CPUscore, GPUscore);
        return false;
    }
    
    return true;
}

void NWSAlgorithmOption::deviceQuery() {
    
    cudaGetDeviceCount(&gpuNumber);
    printf("Device count: %d\n", gpuNumber);

    if(gpuNumber == 0)
        throw (new Exception())->setMessage("ERROR: No gpus to use!");
    
    gpus = new int[gpuNumber];
    if(desiredGpus)
    {
        printf("Using devices(s): ");

        int selectedGpuNumber = 0;
        for(int i = 0; i < desiredGpusNumber; i++)
        {
            if(desiredGpus[i] < gpuNumber)
            {
                //TODO: sprawdzić czy nie wybieramy 2 razy tego samego GPU
                gpus[selectedGpuNumber++] = desiredGpus[i];
                printf("%d ", desiredGpus[i]);
                if(selectedGpuNumber == gpuNumber)
                    break; // no more gpus available on the system
            }
        }
        printf("\n");
        gpuNumber = selectedGpuNumber; // not all gpus have to be used
    }
    else
    {
        for(int i=0; i<gpuNumber; i++)
            gpus[i] = i;
 
    }
    if(gpuNumber == 0)
        throw (new Exception())->setMessage("ERROR: No gpus to use!");

#ifndef MPI_ASM_ALIGN
    maxMemory = INT_MAX;
    cudaDeviceProp devProp;
    
    for(int i = 0; i < gpuNumber; i++)
    {
        int gpuNo = gpus[i];
        cudaSetDevice(gpuNo);
    
        char* devPtr;
        cudaMalloc((void**)&devPtr, sizeof(int));
        cudaMemcpy(devPtr, &gpuNo, sizeof(int), cudaMemcpyHostToDevice);
    
        gpuNo = -1;
        cudaMemcpy(&gpuNo, devPtr, sizeof(int), cudaMemcpyDeviceToHost);
        cudaFree(devPtr);
        
        cudaGetDeviceProperties(&devProp, gpuNo);
        maxMemory = MIN(maxMemory, devProp.totalGlobalMem);
        
        printf("GPU: %d\n",gpuNo);   
    }
    printf("Max mem. to use: %dMB\n", (int)(maxMemory/(1024*1024)));
#endif

}

typedef vector<NWSSingleGPURunnable *>::iterator NWSSGRIterator;

double NWSAlgorithmOption::runAllOnGPU() {
    deviceQuery();
    ThreadManager *tm = new ThreadManager(gpus, gpuNumber);
    NWSSingleGPURunnableInit *init = new NWSSingleGPURunnableInit;
    NWSSingleGPURunnableFinalization *finalize = new NWSSingleGPURunnableFinalization;
    fasterKernelInvocationCount = 0;
    slowerKernelInvocationCount = 0;
    
    int blocks = ceil(pairsPerGPU*1.0 / BLOCK_SIZE);
    int memOffset = ((blocks * BLOCK_SIZE - 1)/128 + 1) * 128;
    
    init->setMemoryOffset(memOffset);
    init->setNWAAlgorithm(this);
    init->setThreadManager(tm);
    
    for (int i = 0; i < gpuNumber; i++)
        tm->request(init, gpus[i]);
    tm->wait();
    
    vector<NWSSingleGPURunnable *> runs;
    //int packageCount = (p->pairsCount - 1) / pairsPerGPU + 1;
    HiResTimer timer;
    timer.start();
    
    for (unsigned int pairToStartWith = pairsPerGPU*mpi_node_id;
            pairToStartWith < p->pairsCount;
            pairToStartWith += pairsPerGPU*mpi_node_count)
    {
        NWSSingleGPURunnable *run = new NWSSingleGPURunnable;
        run->setMemoryOffset(memOffset);
        run->setPairIdToStartWith(pairToStartWith);
        run->setThreadManager(tm);
        run->setNWAAlgorithm(this);
        runs.push_back(run);
        tm->request(run, ThreadManager::anyGpu);
    }
    tm->wait();
    timer.stop();
    printf("GCUPS=%.2f [%.2fms]\n", ((double)cells) / ((double)timer.getElapsedTime())/1000, timer.getElapsedTime()/1000.0);
    printf("FasterKernelCount: %d, SlowerKernelCount: %d\n", fasterKernelInvocationCount, slowerKernelInvocationCount);
    
    finalize->setNWAAlgorithm(this);
    finalize->setThreadManager(tm);
    
    for (int i = 0; i < gpuNumber; i++)
        tm->request(finalize, gpus[i]);
    tm->wait();
//    for (int i = 0; i < )
    
    // CLEANING
    delete init;
    delete finalize;
    for (NWSSGRIterator it = runs.begin(); it != runs.end(); ++it) {
        delete *it;
    }
    
    return (timer.getElapsedTime()/1000.0);
}




