#!/usr/bin/env python

import nsysstats
import time

class CUDAGPULowUtilization(nsysstats.Report):

    # Rule that reports time regions in which the GPU "time" utilization of a
    # process/device pair falls below a threshold. The class is a collection
    # of SQL templates that setup() stitches together according to the
    # options given on the command line and the tables present in the
    # database.

    # Default values of the "rows", "threshold" and "chunks" options.
    ROW_LIMIT = 50
    THRESHOLD = 50
    NUM_CHUNKS = 30

    # Help text. "{{SCRIPT}}" is deliberately left as a "{SCRIPT}" placeholder
    # after f-string evaluation; it is not substituted in this file.
    usage = f"""{{SCRIPT}}[:rows=<limit>][:threshold=<percent>][:chunks=<number>] -- CUDAGPULowUtilization

    Options:
        rows=<limit> - Maximum number of rows returned by the query.
            Default is {ROW_LIMIT}.

        threshold=<percent> - Maximum percentage of time the GPU is being used.
            Default is {THRESHOLD}%.

        chunks=<number> - Number of equal-duration chunks in which the GPU time
            utilization is calculated for each device in a process. If the profile
            duration cannot be evenly divided by <number>, the duration of every
            chunk is rounded up to the nearest integer nanosecond, such that all
            chunks still have the same duration. Due to this rounding:
                - The last chunk may overlap the end of the profiling duration,
                  effectively making the active chunk duration smaller than the
                  other chunks. This difference is accounted for in the in-use
                  percent calculation.
                - In extreme cases, the actual number of active chunks can be
                  smaller than <number>.
            Possible values are integers between 1 and 1000.
            Default is {NUM_CHUNKS}.

    Output: All time values default to nanoseconds
        Row# : Row number of chunk
        In-Use% : Percentage of time the GPU is being used
        Duration : Duration of chunk
        Start : Start time of chunk
        PID : Process identifier
        Device ID : GPU device identifier

    This rule identifies time regions with low GPU utilization. For each
    process, each GPU device is examined, and a time range is created that
    starts with the beginning of the first GPU operation on that device and
    ends with the end of the last GPU operation on that device. This time range
    is then divided into equal chunks, and the GPU utilization is calculated
    for each chunk. The utilization includes all GPU operations as well as CUDA
    profiling overheads that the user cannot address.

    Note that the utilization refers to the "time" utilization and not the
    "resource" utilization. This script attempts to find time gaps when the GPU
    is or isn't being used, but does not take into account how many GPU
    resources are being used. Therefore, a single running memcpy is considered
    the same amount of "utilization" as a huge kernel that takes over all the
    cores. If multiple operations run concurrently in the same chunk, their
    utilization will be added up and may exceed 100%.

    Chunks with an in-use percentage less than the threshold value are displayed.
    If consecutive chunks have a low in-use percentage, the individual chunks are
    coalesced into a single display record, keeping the weighted average of
    percentages. This is why returned chunks may have different durations.
"""

    # Temporary table holding one row per chunk index.
    create_chunk_table = """
    CREATE TEMP TABLE CHUNK (
        rangeId   INTEGER PRIMARY KEY   NOT NULL
    )
"""

    # Fill temp.CHUNK with the chunk indexes 0 .. NUM_CHUNKS - 1 using a
    # recursive CTE bounded by LIMIT.
    insert_chunk_table = """
    INSERT INTO temp.CHUNK
    WITH RECURSIVE
        range AS (
            SELECT
                0 AS rangeId
            UNION ALL
            SELECT
                rangeId + 1 AS rangeId
            FROM
                range
            LIMIT {NUM_CHUNKS}
        )
    SELECT rangeId FROM range
"""

    # Outermost query: number the rows (lowest utilization first) and give the
    # columns their display names.
    query_format_columns = """
    SELECT
        ROW_NUMBER() OVER(ORDER BY average, duration) AS "Row#",
        average AS "In-Use%",
        duration AS "Duration:dur_ns",
        start AS "Start:ts_ns",
        pid AS "PID",
        deviceId AS "Device ID",
        globalPid AS "_Global PID"
    FROM
        ({CHUNKS})
"""

    # Return chunks that have an average GPU utilization below the given
    # threshold.
    # 1. CTE "range": Define the range being analyzed for each deviceId/PID
    #    with the corresponding chunk size.
    # 2. CTE "chunk": Duplicate chunks for each deviceId/PID with the
    #    appropriate start and end.
    # 3. CTE "utilization": Find all ranges being run in each chunk and keep
    #    only the ones that have a percentage of GPU utilization lower than the
    #    threshold. If there are multiple streams, the utilizations are added
    #    up. The grouping uses the "chunk" columns, not the "ops" columns:
    #    because of the LEFT JOIN the "ops" columns are NULL for an idle chunk,
    #    and grouping by them would merge the idle chunks of different
    #    devices/processes that share the same rangeId into a single row.
    # 4. CTE "chunkgroup": Give a groupId that will be used to define
    #    consecutive chunks.
    # 5. Coalesce chunks with same groupId and calculate the weighted average.
    #    The rows are ordered by (average, duration) before the LIMIT is
    #    applied so that the row limit keeps the lowest-utilization regions
    #    instead of an arbitrary subset.
    query_chunk = """
    WITH
        ops AS (
            {OPS_ALL}
        ),
        range AS (
            SELECT
                min(start) AS start,
                max(end) AS end,
                ceil(CAST(max(end) - min(start) AS FLOAT) / {NUM_CHUNKS}) AS chunkSize,
                pid,
                globalPid,
                deviceId
            FROM
                ops
            GROUP BY deviceId, pid
        ),
        chunk AS (
            SELECT
                chunk.rangeId,
                chunk.rangeId * range.chunkSize + range.start AS cstart,
                min(chunk.rangeId * range.chunkSize + range.start + range.chunkSize, range.end) AS cend,
                chunkSize,
                range.pid,
                range.globalPid,
                range.deviceId
            FROM
                temp.CHUNK AS chunk
            JOIN
                range
                ON cstart < cend
        ),
        utilization AS (
            SELECT
                chunk.rangeId,
                chunk.cstart AS start,
                chunk.cend AS end,
                chunk.cend - chunk.cstart AS size,
                sum(CAST(coalesce(min(ops.end, chunk.cend) - max(ops.start, chunk.cstart), 0) AS FLOAT)) / (chunk.cend - chunk.cstart) * 100 AS timePercentage,
                chunk.pid,
                chunk.globalPid,
                chunk.deviceId
            FROM
                chunk
            LEFT JOIN
                ops
                ON      ops.deviceId = chunk.deviceId
                    AND ops.pid = chunk.pid
                    AND ops.start < chunk.cend
                    AND ops.end > chunk.cstart
            GROUP BY
                chunk.rangeId, chunk.deviceId, chunk.pid
            HAVING
                timePercentage < {THRESHOLD}
        ),
        chunkgroup AS
        (
            SELECT
                *,
                rangeId - ROW_NUMBER() OVER (PARTITION BY pid, deviceId ORDER BY rangeId) AS groupId
            FROM
                utilization
        )
    SELECT
        min(start) AS start,
        max(end) - min(start) AS duration,
        round(sum(size * timePercentage) / sum(size), 1) AS average,
        pid,
        globalPid,
        deviceId
    FROM
        chunkgroup
    GROUP BY groupId, deviceId, pid
    ORDER BY average, duration
    LIMIT {ROW_LIMIT}
"""

    # Select columns of kernel/memory operations.
    query_select = """
    SELECT
        start,
        end,
        (globalPid >> 24) & 0x00FFFFFF AS pid,
        globalPid,
        deviceId
    FROM
        {GPU_OPERATION}
"""

    # Combine kernel/memory operations.
    query_union = """
        UNION ALL
"""

    # Add the profiler overhead to the GPU operation table returned by
    # "query_union".
    # 1. CTE "range": Get [min(start), max(end)] for each deviceId/PID. It will
    #    be used as the clipping range for overheads.
    # 2. CTE "cudaoverhead": Select CUDA profiling overhead that we want to
    #    take into account.
    # 3. Duplicate overhead rows for each deviceId/PID. This will create a
    #    deviceId column that is not initially in the PROFILER_OVERHEAD table.
    #    i.e., a CUDA profiling overhead on one thread affects all GPUs of the
    #    same process.
    # 4. The overhead rows are combined with GPU operation rows.
    query_overhead = """
    WITH
        gpuops AS (
            {GPU_OPS_ALL}
        ),
        range AS (
            SELECT
                min(start) AS start,
                max(end) AS end,
                pid,
                globalPid,
                deviceId
            FROM
                gpuops
            GROUP BY deviceId, pid
        ),
        cudaoverheadID AS (
            SELECT
                id
            FROM
                StringIds
            WHERE
                value = 'CUDA profiling data flush overhead'
                OR value = 'CUDA profiling stop overhead'
                OR value = 'CUDA profiling overhead'
        ),
        cudaoverhead AS (
            SELECT
                po.start,
                po.end,
                (po.globalTid >> 24) & 0x00FFFFFF AS pid
            FROM
                PROFILER_OVERHEAD AS po
            JOIN
                cudaoverheadID AS co
                ON co.id = po.nameId
        )
    SELECT
        co.start,
        co.end,
        co.pid,
        range.globalPid,
        range.deviceId
    FROM
        cudaoverhead AS co
    JOIN
        range
        ON
                co.pid = range.pid
            AND co.start > range.start
            AND co.end < range.end
    UNION ALL
    SELECT
        *
    FROM
        gpuops
"""

    def setup(self):
        """Parse the report options and assemble the SQL to be executed.

        On success, sets "self.statements" (creation and population of the
        temporary chunk table) and "self.query" (the final report query).

        Returns:
            None on success. Otherwise an error message string: either the
            one returned by the parent class setup(), or a message stating
            that the database holds none of the CUDA kernel/memcpy/memset
            tables.

        Raises:
            SystemExit: with code EXIT_INVALID_ARG when an option is unknown,
                is not of the form "name=<non-negative integer>", or when
                "chunks" is outside of the 1..1000 range.
        """
        err = super().setup()
        if err is not None:
            return err

        options = {
            'rows': self.ROW_LIMIT,
            'threshold': self.THRESHOLD,
            'chunks': self.NUM_CHUNKS,
        }

        for arg in self.args:
            # A second '=' ends up in "value" and fails the isdigit() test.
            name, _, value = arg.partition('=')

            # isdigit() rejects signs, blanks and empty values, but accepts
            # digit-like characters (e.g. superscripts) that int() refuses.
            # Converting here and interpolating the resulting integer keeps
            # anything other than a plain number out of the SQL text.
            try:
                number = int(value) if value.isdigit() else None
            except ValueError:
                number = None

            valid = name in options and number is not None
            if valid and name == 'chunks':
                valid = 1 <= number <= 1000
            if not valid:
                # Same effect as exit(), without relying on the helper that
                # the "site" module installs for interactive use.
                raise SystemExit(self.EXIT_INVALID_ARG)

            options[name] = number

        self.statements = [
            self.create_chunk_table,
            self.insert_chunk_table.format(NUM_CHUNKS=options['chunks'])]

        # One sub-query per GPU operation table present in the database.
        gpu_tables = (
            'CUPTI_ACTIVITY_KIND_KERNEL',
            'CUPTI_ACTIVITY_KIND_MEMCPY',
            'CUPTI_ACTIVITY_KIND_MEMSET',
        )
        sub_queries = [
            self.query_select.format(GPU_OPERATION=table)
            for table in gpu_tables
            if self.table_exists(table)]

        if not sub_queries:
            return "{DBFILE} could not be analyzed because it does not contain CUDA trace data."

        union = self.query_union.join(sub_queries)

        # Profiling overheads count as GPU usage when they were recorded.
        if self.table_exists('PROFILER_OVERHEAD'):
            union = self.query_overhead.format(GPU_OPS_ALL=union)

        chunks = self.query_chunk.format(
            OPS_ALL=union,
            NUM_CHUNKS=options['chunks'],
            THRESHOLD=options['threshold'],
            ROW_LIMIT=options['rows'])

        self.query = self.query_format_columns.format(CHUNKS=chunks)

# Script entry point: when this file is executed directly (rather than
# imported), run the report through Main(). Main() is not defined in this
# file -- presumably it is inherited from nsysstats.Report.
if __name__ == "__main__":
    CUDAGPULowUtilization.Main()
