-1

I'm having trouble with the shape of a struct that I pass to an NVML library function via P/Invoke. The struct contains a fixed-size array and some unsigned long longs. I'm not getting any compiler errors, but the data returned seems to be corrupted (the int isRunning should only ever be 0 or 1, yet I'm getting a 30, a 3, ...). Here is its declaration: Accounting struct

Public Variables

unsigned int  gpuUtilization
unsigned int  isRunning //Flag to represent if the process is running (1 for running, 0 for terminated). 
unsigned long long  maxMemoryUsage
unsigned int  memoryUtilization
unsigned int  reserved[5] //Reserved for future use. 
unsigned long long  startTime //CPU Timestamp in usec representing start time for the process. 
unsigned long long  time

My code(minimal repeatable version, console c# app):

// c# version of accounting struct:
        // Managed mirror of NVML's nvmlAccountingStats_t, received from
        // nvmlDeviceGetAccountingStats through an `out` parameter.
        // NOTE(review): the fields are declared in the alphabetical order used by the
        // documentation listing, not in the declaration order of nvml.h, so values
        // written by the native side end up in the wrong managed fields.
        public struct nvmlAccountingStats_t
        {
            public uint gpuUtilization;
            // Flag: 1 for running, 0 for terminated.
            public uint isRunning;
            public ulong maxMemoryUsage;
            public uint memoryUtilization;
            // Fixed-size inline array of 5 elements, reserved for future use.
            [MarshalAs(UnmanagedType.ByValArray, SizeConst = 5)]
            public uint[] reserved;
            // CPU timestamp in usec representing the start time of the process.
            public ulong startTime;
            public ulong time;
        }
        // Status codes returned by every NVML entry point imported below.
        // NVML_SUCCESS (0) is the only value that indicates success.
        public enum nvmlReturn_t
        {
            NVML_SUCCESS = 0,                           // The operation was successful.
            NVML_ERROR_UNINITIALIZED = 1,               // NVML was not first initialized with nvmlInit().
            NVML_ERROR_INVALID_ARGUMENT = 2,            // A supplied argument is invalid.
            NVML_ERROR_NOT_SUPPORTED = 3,               // The requested operation is not available on target device.
            NVML_ERROR_NO_PERMISSION = 4,               // The current user does not have permission for operation.
            NVML_ERROR_ALREADY_INITIALIZED = 5,         // Deprecated: Multiple initializations are now allowed through ref counting.
            NVML_ERROR_NOT_FOUND = 6,                   // A query to find an object was unsuccessful.
            NVML_ERROR_INSUFFICIENT_SIZE = 7,           // An input argument is not large enough.
            NVML_ERROR_INSUFFICIENT_POWER = 8,          // A device's external power cables are not properly attached.
            NVML_ERROR_DRIVER_NOT_LOADED = 9,           // NVIDIA driver is not loaded.
            NVML_ERROR_TIMEOUT = 10,                    // User provided timeout passed.
            NVML_ERROR_IRQ_ISSUE = 11,                  // NVIDIA Kernel detected an interrupt issue with a GPU.
            NVML_ERROR_LIBRARY_NOT_FOUND = 12,          // NVML Shared Library couldn't be found or loaded.
            NVML_ERROR_FUNCTION_NOT_FOUND = 13,         // Local version of NVML doesn't implement this function.
            NVML_ERROR_CORRUPTED_INFOROM = 14,          // infoROM is corrupted.
            NVML_ERROR_GPU_IS_LOST = 15,                // The GPU has fallen off the bus or has otherwise become inaccessible.
            NVML_ERROR_RESET_REQUIRED = 16,             // The GPU requires a reset before it can be used again.
            NVML_ERROR_OPERATING_SYSTEM = 17,           // The GPU control device has been blocked by the operating system/cgroups.
            NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,    // RM detects a driver/library version mismatch.
            NVML_ERROR_IN_USE = 19,                     // An operation cannot be performed because the GPU is currently in use.
            NVML_ERROR_MEMORY = 20,                     // Insufficient memory.
            NVML_ERROR_NO_DATA = 21,                    // No data.
            NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22,     // The requested vgpu operation is not available on target device, because ECC is enabled.
            NVML_ERROR_INSUFFICIENT_RESOURCES = 23,     // Ran out of critical resources, other than memory.
            // NOTE(review): the description below is word-for-word the same as the one for
            // NVML_ERROR_INSUFFICIENT_RESOURCES; the name suggests a frequency-related error — confirm against nvml.h.
            NVML_ERROR_FREQ_NOT_SUPPORTED = 24,         // Ran out of critical resources, other than memory.
            NVML_ERROR_UNKNOWN = 999                    // An internal driver error occurred.
        }
        // P/Invoke declarations for the NVML entry points used by Main.
        // NVMLDLL is a library-name constant defined outside this snippet.

        // Initializes NVML; Main only continues when this returns NVML_SUCCESS.
        [DllImport(NVMLDLL)]
        private static extern nvmlReturn_t nvmlInit_v2();
        // Called once by Main after all devices have been processed.
        [DllImport(NVMLDLL)]
        private static extern nvmlReturn_t nvmlShutdown();
        // Writes the device count to deviceCount; Main uses it as the loop bound.
        [DllImport(NVMLDLL)]
        private static extern nvmlReturn_t nvmlDeviceGetCount_v2(out uint deviceCount);
        // Writes an opaque device handle for the device at the given index.
        // NOTE(review): index is declared as int here; the native parameter is presumably
        // unsigned int — confirm against nvml.h.
        [DllImport(NVMLDLL)]
        private static extern nvmlReturn_t nvmlDeviceGetHandleByIndex_v2(int index, out IntPtr handle);
        // Main calls this twice: first with pids == null to obtain the required count,
        // then with an array of that size to receive the PIDs.
        [DllImport(NVMLDLL)]
        private static extern nvmlReturn_t nvmlDeviceGetAccountingPids(IntPtr device, ref uint count, uint[]? pids);
        // Fills stats for the given pid on the given device. The managed
        // nvmlAccountingStats_t must match the native layout for the result to be meaningful.
        [DllImport(NVMLDLL)]
        private static extern nvmlReturn_t nvmlDeviceGetAccountingStats(IntPtr device, uint pid, out nvmlAccountingStats_t stats);

        // Minimal repro: for every NVML device, lists the accounting PIDs and prints
        // the isRunning flag reported for each one.
        // NOTE(review): apart from nvmlInit_v2 and the second nvmlDeviceGetAccountingPids
        // call, the nvmlReturn_t results are ignored, so a failed call would go unnoticed
        // and its out-values would be printed as if they were valid.
        static void Main(string[] args)
        {
            if (nvmlInit_v2() == nvmlReturn_t.NVML_SUCCESS)
            {
                // Return code not checked.
                nvmlDeviceGetCount_v2(out uint deviceCount);
                for (int i = 0; i < deviceCount; i++)
                {
                    // Return code not checked.
                    nvmlDeviceGetHandleByIndex_v2(i, out IntPtr _device);
                    // get the pids
                    // First call passes null with count == 0 so that count receives the
                    // number of entries to allocate; its return code is not checked.
                    uint count = 0;
                    nvmlDeviceGetAccountingPids(_device, ref count, null);
                    uint[] pids = new uint[count];
                    // Second call fills the array; only on success are the stats queried.
                    if (nvmlDeviceGetAccountingPids(_device, ref count, pids) == nvmlReturn_t.NVML_SUCCESS)
                    {
                        foreach (uint pid in pids)
                        { 
                            // Return code not checked; stats is printed regardless of the outcome.
                            nvmlDeviceGetAccountingStats(_device, pid, out nvmlAccountingStats_t stats);
                            Console.WriteLine($"pid: {pid}, isRunning: {stats.isRunning}");
                        }

                    }

                }
                nvmlShutdown();
            }
        }
    }

At first I saw that I had some data back and was excited, but then I checked the isRunning results and they were as follows:

pid: 6560, isRunning: 2
pid: 31636, isRunning: 0
pid: 30000, isRunning: 0
pid: 4164, isRunning: 0
pid: 3368, isRunning: 0
pid: 30420, isRunning: 0
pid: 732, isRunning: 0
pid: 14576, isRunning: 0
pid: 27352, isRunning: 0
pid: 25632, isRunning: 0
pid: 7352, isRunning: 0
pid: 32960, isRunning: 0
pid: 25380, isRunning: 30
pid: 21376, isRunning: 0
pid: 21396, isRunning: 0
pid: 7488, isRunning: 0
pid: 39200, isRunning: 0
pid: 15536, isRunning: 3

and for a value that's supposed to be only a 1 or 0, seeing a 30 and a 3 made me scratch my head ^_^ The gpuutilization values were also off but the isRunning is more noticeable lol

So if anyone knows the correct form that the struct should take, or if I'm not passing it correctly to the function I'd sure appreciate any help ^_^ thank you very much

Updated with answer thanks to @Swift - Friday Pie: I learned a lesson that I should read the implementation file itself if possible rather than referring to the documentation and assuming it's showing the correct order =)

        // Managed mirror of NVML's nvmlAccountingStats_t with the fields in the same
        // order as the native declaration in nvml.h, laid out sequentially so the
        // marshaller keeps that order.
        [StructLayout(LayoutKind.Sequential)]
        public struct nvmlAccountingStats_t
        {
            public uint gpuUtilization;
            public uint memoryUtilization;
            public ulong maxMemoryUsage;
            public ulong time;
            // CPU timestamp in usec representing the start time of the process.
            public ulong startTime;
            // Flag: 1 for running, 0 for terminated.
            public uint isRunning;
            // Fixed-size inline array of 5 elements, reserved for future use.
            [MarshalAs(UnmanagedType.ByValArray, SizeConst = 5)]
            public uint[] reserved;
        }

the updated c# struct shape is returning the information expected that I can see so far(isRunning is only 1 or 0!), thank you for the help!

6
  • You don't check the return value when you call the function to get the stats. In fact there are 4 calls where the return value is not checked. Commented Sep 9, 2022 at 6:31
  • @DavidHeffernan yea this is specifically a minimum repeatable example to show the issue, there's (almost)) no error checking, it's separate entirely from the final project but I didn't wanna spam SO with the nonessentials to the question =) Commented Sep 9, 2022 at 7:03
  • That's a mistake. We need to see that the functions are successful since that is the most common explanation in a question like this. Commented Sep 9, 2022 at 7:17
  • I appreciate your opinion on that, thankfully the individual who posted about 4 minutes before your initial comment seems to have been able to deduce the answer as they were able to consider something else other than "he didn't check for an error in this minimum repeatable example" which I had pulled from other code(other calls to the same library successful) and quickly happened upon what appears to be at bare minimum a step in the right direction and something I can explore more fully. Thank you for browsing to attempt to answer though :-) Commented Sep 9, 2022 at 7:33
  • Please post answers as an answer, not as part of the question Commented Sep 9, 2022 at 9:23

1 Answer 1

2

The documentation doesn't describe the actual layout of the structure (blame NVIDIA); it's an alphabetically sorted list of the members. The real declaration in nvml.h looks like this:

// Per-process accounting statistics. The member order below is the declaration
// order and therefore the in-memory order that any interop mirror must reproduce.
// NOTE(review): assuming 4-byte unsigned int and 8-byte unsigned long long (confirm
// for the target ABI), the members sit at offsets 0, 4, 8, 16, 24, 32 and 36 with no
// internal padding, for a total size of 56 bytes.
typedef struct nvmlAccountingStats_st {
    unsigned int gpuUtilization;                //!< Percent of time over the process's lifetime during which one or more kernels was executing on the GPU.
                                                //! Utilization stats just like returned by \ref nvmlDeviceGetUtilizationRates but for the life time of a
                                                //! process (not just the last sample period).
                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported

    unsigned int memoryUtilization;             //!< Percent of time over the process's lifetime during which global (device) memory was being read or written.
                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlDeviceGetUtilizationRates is not supported

    unsigned long long maxMemoryUsage;          //!< Maximum total memory in bytes that was ever allocated by the process.
                                                //! Set to NVML_VALUE_NOT_AVAILABLE if nvmlProcessInfo_t->usedGpuMemory is not supported


    unsigned long long time;                    //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if
                                                //!< the process is not terminated

    unsigned long long startTime;               //!< CPU Timestamp in usec representing start time for the process

    unsigned int isRunning;                     //!< Flag to represent if the process is running (1 for running, 0 for terminated)

    unsigned int reserved[5];                   //!< Reserved for future use
} nvmlAccountingStats_t;

That's without accounting for the fact that C# might use a different memory layout for the same declaration; it's implementation-defined.

Sign up to request clarification or add additional context in comments.

2 Comments

Updated to ensure the struct's fields are in the same order and that its layout is sequential; everything now seems to be mostly as expected, and it's the right direction! Thanks for that =) learned a practical lesson with that one lol
@codingNewb How not to write docs? Yeah, they are not doing a great job; it's a simplistic auto-generated page, but the declaration COULD be included. E.g. in POSIX or in the Windows API all structures are documented as their declarations. Hiding the implementation like that implies they don't care about updating the documentation if it changes. I assume there might be a padding problem... how does C# pad structures in this case?

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.