/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 4 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Initial Developer of the Original Code is
 * University of Szeged.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Department of Software Engineering
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#ifdef ENABLE_PROFILE_ARM

#include "ARMProfiler.h"

// memory handling
#include <stdlib.h>

// provides clock() and time() functions
#include <time.h>

namespace ARMProfiler {

// One row of the profile: the accumulated time and the hit count of a
// (function name, opcode descriptor) pair. Either label may be NULL.
// Depending on different ARM cores, we might support performance counters
struct ProfileEntity {
    // Sum of the clock() differences added by endSampling()
    uint64_t totalTime;
    // Number of samples closed by endSampling()
    uint32_t count;
    // Label printed by printProfile(); skipped there when NULL.
    // addProfileEvent() stores the pointer itself, the string is not copied.
    const char* functionName;
    // Label printed by printProfile(); skipped there when NULL.
    // addProfileEvent() stores the pointer itself, the string is not copied.
    const char* opcodeDescriptor;
};

// One slot of the sampling stack: which entity is being timed and when its
// timing started.
// CLOCKS_PER_SEC is 1000000 on ARM (the time overflows after 4000 sec ~ 1.1 hour)
// NOTE(review): the overflow estimate presumably assumes a 32-bit clock_t - confirm.

struct StackEntity {
    // Index into profileGlobals.entities of the entity being timed
    uint32_t index;
    // clock() value taken when timing of this entity started
    clock_t startClock;
};

// Helps to free memory: one entry of the list of blocks registered with
// autoFreeMemory(). printProfile() calls free() on every registered pointer.

struct MemoryEntity {
    // Block handed over to the profiler; released with free()
    void* ptr;
};

// Complete state of the profiler: three realloc()-grown arrays together
// with their element counters.
struct ProfileGlobals {
    // Entity table, indexed by the values returned from addProfileEvent()
    ProfileEntity* entities;
    // Number of entities registered so far
    uint32_t numberOfEntities;
    // Sampling stack; the active slot is stack[stackDepth - 1]
    StackEntity* stack;
    // Number of stack slots currently in use
    uint32_t stackDepth;
    // Number of stack slots currently allocated
    uint32_t maxStackDepth;
    // Blocks registered with autoFreeMemory()
    MemoryEntity* freeBlocks;
    // Number of registered blocks
    uint32_t numberOfBlocks;
};

// The single profiler state shared by every function in this file.
// It has no explicit initializer; as a namespace-scope object it starts out
// zero-initialized, so all arrays are NULL and all counters are 0 at first use.
// TODO: When we support threads, it should be a
// thread local variable (and initialized to 0)
ProfileGlobals profileGlobals;

// Growth steps, in elements, of the dynamic arrays in profileGlobals.
// All must be power of 2: the entity and memory arrays detect "array is
// full" by masking the element count with (GROW - 1). The stack code only
// adds STACK_ARRAY_GROW to the current capacity.
#define ENTITY_ARRAY_GROW 64
#define STACK_ARRAY_GROW 128
#define MEMORY_ARRAY_GROW 64

uint32_t addProfileEvent(const char* functionName, const char* opcodeDescriptor)
{
    if ((profileGlobals.numberOfEntities & (ENTITY_ARRAY_GROW - 1)) == 0) {
        // TODO: throw out-of-memory exception
        profileGlobals.entities = (ProfileEntity*)realloc(profileGlobals.entities,
            (profileGlobals.numberOfEntities + ENTITY_ARRAY_GROW) * sizeof(ProfileEntity));
    }

    profileGlobals.entities[profileGlobals.numberOfEntities].totalTime = 0;
    profileGlobals.entities[profileGlobals.numberOfEntities].count = 0;
    profileGlobals.entities[profileGlobals.numberOfEntities].functionName = functionName;
    profileGlobals.entities[profileGlobals.numberOfEntities].opcodeDescriptor = opcodeDescriptor;
    return profileGlobals.numberOfEntities++;
}

// Writes the collected statistics to `output` and then releases every piece
// of memory owned by the profiler.
//
// For each registered entity one line is printed with its labels (NULL
// labels are skipped), its sample count and its average time in
// milliseconds (integer division, 0 when the entity was never sampled).
// A warning is printed first if the sampling stack is not empty.
//
// After the report the blocks registered with autoFreeMemory() and the
// three internal arrays are freed, and profileGlobals is put back into its
// empty state so that a later call does not free the same pointers again.
void printProfile(FILE* output)
{
    // TODO: sorting?

    if (profileGlobals.stackDepth != 0)
        fprintf(output, "WARNING: stackDepth is non-zero (%u)\n", (unsigned)profileGlobals.stackDepth);

    fprintf(output, "========================================\n"
                    "function + opcode detailed statistics\n"
                    "========================================\n\n");
    for (uint32_t i = 0; i < profileGlobals.numberOfEntities; ++i) {
        ProfileEntity* entity = &profileGlobals.entities[i];
        if (entity->functionName)
            fprintf(output, "function: %s ", entity->functionName);
        if (entity->opcodeDescriptor)
            fprintf(output, "inst: %s ", entity->opcodeDescriptor);

        // ticks -> milliseconds per sample: ticks * 1000 / (CLOCKS_PER_SEC * count)
        unsigned long long avgTime = 0;
        if (entity->count)
            avgTime = (unsigned long long)((entity->totalTime * 1000) / (CLOCKS_PER_SEC * (uint64_t)entity->count));

        // Both values are unsigned; the casts make the arguments match the
        // conversion specifiers on every platform.
        fprintf(output, "count: %u avg_time: %llu ms\n", (unsigned)entity->count, avgTime);
    }

#if 0
    fprintf(output, "\n==============================\n"
                    "aggregated opcode statistics\n"
                    "================================\n\n");
    // This loop overwrites the opcodeDescriptor, which is safe because it is not used anymore
    for (uint32_t i = 0; i < profileGlobals.numberOfEntities; ++i) {
        if (profileGlobals.entities[i].opcodeDescriptor) {
            const char* opcodeDescriptor = profileGlobals.entities[i].opcodeDescriptor;
            uint64_t totalTime = profileGlobals.entities[i].totalTime;
            uint32_t count = profileGlobals.entities[i].count;

            for (uint32_t j = i + 1; j < profileGlobals.numberOfEntities; ++j)
                if (profileGlobals.entities[j].opcodeDescriptor == opcodeDescriptor) {
                    totalTime += profileGlobals.entities[j].totalTime;
                    count += profileGlobals.entities[j].count;
                    profileGlobals.entities[j].opcodeDescriptor = NULL;
                }
            fprintf(output, "inst: %s count: %d avg_time: %lld\n", opcodeDescriptor,
                count, count ? (totalTime / count) : 0 /* * 1000 / CLOCKS_PER_SEC */);
        }
    }
#endif

    // Release the blocks handed over through autoFreeMemory()
    for (uint32_t i = 0; i < profileGlobals.numberOfBlocks; ++i)
        free(profileGlobals.freeBlocks[i].ptr);

    // free(NULL) is a no-op, so no guards are needed
    free(profileGlobals.entities);
    free(profileGlobals.stack);
    free(profileGlobals.freeBlocks);

    // Leave no dangling pointers or stale counters behind
    profileGlobals.entities = NULL;
    profileGlobals.numberOfEntities = 0;
    profileGlobals.stack = NULL;
    profileGlobals.stackDepth = 0;
    profileGlobals.maxStackDepth = 0;
    profileGlobals.freeBlocks = NULL;
    profileGlobals.numberOfBlocks = 0;
}

void autoFreeMemory(void* ptr)
{
    if ((profileGlobals.numberOfBlocks & (MEMORY_ARRAY_GROW - 1)) == 0) {
        // TODO: throw out-of-memory exception
        profileGlobals.freeBlocks = (MemoryEntity*)realloc(profileGlobals.freeBlocks,
            (profileGlobals.numberOfBlocks + MEMORY_ARRAY_GROW) * sizeof(MemoryEntity));
    }

    profileGlobals.freeBlocks[profileGlobals.numberOfBlocks].ptr = ptr;
    ++profileGlobals.numberOfBlocks;
}

void startSampling(uint32_t index)
{
    if (profileGlobals.stackDepth >= profileGlobals.maxStackDepth) {
        profileGlobals.maxStackDepth += STACK_ARRAY_GROW;
        profileGlobals.stack = (StackEntity*)realloc(profileGlobals.stack, 
            (profileGlobals.maxStackDepth) * sizeof(StackEntity));
    }

    profileGlobals.stack[profileGlobals.stackDepth].startClock = clock();
    profileGlobals.stack[profileGlobals.stackDepth].index = index;
    ++profileGlobals.stackDepth;
}

// Finishes the sample in the top slot of the sampling stack: the elapsed
// clock() ticks are added to the entity recorded in that slot and its
// sample count is incremented.
//
// index: StopSampling removes the top slot from the stack; any other value
// is taken as an entity index and the top slot is reused to start timing
// that entity from now.
//
// When the stack is empty nothing is recorded; a warning is printed and the
// function returns.
void endSampling(uint32_t index)
{
    // stackDepth is unsigned: with an empty stack, stackDepth - 1 would wrap
    // around and the slot access below would be far out of bounds.
    if (profileGlobals.stackDepth == 0) {
        printf("WARNING: stackDepth is already zero!\n");
        return;
    }

    StackEntity* stackPtr = &profileGlobals.stack[profileGlobals.stackDepth - 1];
    ProfileEntity* entity = &profileGlobals.entities[stackPtr->index];
    entity->totalTime += (uint64_t)(clock() - stackPtr->startClock);
    entity->count++;

    if (index == StopSampling) {
        --profileGlobals.stackDepth;
    } else {
        // Continue with the next entity in the same slot
        stackPtr->startClock = clock();
        stackPtr->index = index;
    }
}

// Drops the top `level` slots of the sampling stack without recording any
// time for them.
//
// If `level` is larger than the current depth, a warning is printed and the
// stack is emptied instead.
void popCallStack(uint32_t level)
{
    // The subtraction is only safe while level does not exceed the depth;
    // stackDepth is unsigned and would wrap around otherwise.
    if (level <= profileGlobals.stackDepth)
        profileGlobals.stackDepth -= level;
    else {
        printf("WARNING: stackDepth goes below zero!\n");
        profileGlobals.stackDepth = 0;
    }
}

} // namespace ARMProfiler

#endif // ENABLE_PROFILE_ARM

