From d2de2703e42ab0d7875ce615c2c5ecd29a4d90ca Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Tue, 24 Mar 2026 10:55:33 +0000 Subject: [PATCH] Fix HIP and HIPRTC Python bindings for correctness, add missing APIs and tests Built with Claude (claude-4.6-opus) via Cursor. hip.py: - Rewrite hipDeviceProperties struct to match hipDeviceProp_tR0600 layout (100+ fields reordered/added including hipUUID, hipDeviceArch, arrays) - Use hipGetDevicePropertiesR0600 symbol to match C++ macro redefinition - Fix hipMemoryType enum values (hipMemoryTypeArray=10, Unified=11, add Unregistered=0, Managed=3) - Fix hipPointerAttributes struct (add isManaged, allocationFlags; rename memoryType to type) - Fix hipMallocPitch to treat pitch as output parameter - Fix hipError to handle string arguments without crashing - Add argtypes for hipPointerGetAttributes, hipMemGetInfo, hipEventCreateWithFlags (c_uint for flags) - Clean up hipModuleLaunchKernel (remove dead code, rename params) - Add 16 missing error codes to knownExceptions hiprtc.py: - Fix hiprtcGetProgramLog/hiprtcGetCode to use create_string_buffer instead of writing into immutable Python bytes objects - Fix hiprtcCheckStatus crash on unknown error codes - Fix hiprtcGetErrorString to return decoded string - Add HIPRTC_ERROR_LINKING=100 to knownExceptions - Add missing API bindings: hiprtcVersion, hiprtcGetLoweredName, hiprtcGetBitcode/hiprtcGetBitcodeSize tests: - Add test_device_management.py (hipInit, Get/SetDevice, Synchronize) - Add test_device_properties.py (21 field validations) - Add test_memory_advanced.py (MemGetInfo, MallocPitch, PointerAttributes) - Add test_event_flags.py (flag variants, timing-disabled events) - Add test_module.py (load/unload, get function, kernel launch, streams) - Add test_error_handling.py (error class, known/unknown codes, strings) Made-with: Cursor --- pyhip/hip.py | 408 +++++++++++++++++--------------- pyhip/hiprtc.py | 146 ++++++++++-- tests/test_device_management.py | 38 +++ tests/test_device_properties.py | 108 +++++++++ tests/test_error_handling.py | 53 +++++ tests/test_event_flags.py | 47 ++++ tests/test_memory_advanced.py | 53 +++++ tests/test_module.py | 147 ++++++++++++ 8 files changed, 783 insertions(+), 217 deletions(-) create mode 100644 tests/test_device_management.py create mode 100644 tests/test_device_properties.py create mode 100644 tests/test_error_handling.py create mode 100644 tests/test_event_flags.py create mode 100644 tests/test_memory_advanced.py create mode 100644 tests/test_module.py diff --git a/pyhip/hip.py b/pyhip/hip.py index 37d64f9..0ae61b8 100644 --- a/pyhip/hip.py +++ b/pyhip/hip.py @@ -116,70 +116,88 @@ class hipError(Exception): """hip error""" def __init__(self, error=0) -> None: - super().__init__(hipGetErrorString(error)) + if isinstance(error, int): + super().__init__(hipGetErrorString(error)) + else: + super().__init__(str(error)) knownExceptions = set( [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 13, - 17, - 21, - 35, - 52, - 53, - 98, - 100, - 101, - 200, - 201, - 202, - 205, - 206, - 207, - 208, - 209, - 210, - 211, - 212, - 213, - 214, - 215, - 216, - 217, - 218, - 219, - 300, - 301, - 302, - 303, - 304, - 400, - 500, - 600, - 700, - 701, - 702, - 704, - 705, - 708, - 710, - 712, - 713, - 719, - 720, - 801, - 999, - 1052, - 1053, + 1, # hipErrorInvalidValue + 2, # hipErrorOutOfMemory / hipErrorMemoryAllocation + 3, # hipErrorNotInitialized / hipErrorInitializationError + 4, # hipErrorDeinitialized + 5, # hipErrorProfilerDisabled + 6, # hipErrorProfilerNotInitialized + 7, # hipErrorProfilerAlreadyStarted + 8, # hipErrorProfilerAlreadyStopped + 9, # hipErrorInvalidConfiguration + 12, # hipErrorInvalidPitchValue + 13, # hipErrorInvalidSymbol + 17, # hipErrorInvalidDevicePointer + 21, # hipErrorInvalidMemcpyDirection + 35, # hipErrorInsufficientDriver + 52, # hipErrorMissingConfiguration + 53, # hipErrorPriorLaunchFailure + 98, # hipErrorInvalidDeviceFunction + 100, # hipErrorNoDevice + 101, # hipErrorInvalidDevice + 200, # hipErrorInvalidImage + 201, # hipErrorInvalidContext + 202, # hipErrorContextAlreadyCurrent + 205, # hipErrorMapFailed / hipErrorMapBufferObjectFailed + 206, # hipErrorUnmapFailed + 207, # hipErrorArrayIsMapped + 208, # hipErrorAlreadyMapped + 209, # hipErrorNoBinaryForGpu + 210, # hipErrorAlreadyAcquired + 211, # hipErrorNotMapped + 212, # hipErrorNotMappedAsArray + 213, # hipErrorNotMappedAsPointer + 214, # hipErrorECCNotCorrectable + 215, # hipErrorUnsupportedLimit + 216, # hipErrorContextAlreadyInUse + 217, # hipErrorPeerAccessUnsupported + 218, # hipErrorInvalidKernelFile + 219, # hipErrorInvalidGraphicsContext + 300, # hipErrorInvalidSource + 301, # hipErrorFileNotFound + 302, # hipErrorSharedObjectSymbolNotFound + 303, # hipErrorSharedObjectInitFailed + 304, # hipErrorOperatingSystem + 400, # hipErrorInvalidHandle / hipErrorInvalidResourceHandle + 401, # hipErrorIllegalState + 500, # hipErrorNotFound + 600, # hipErrorNotReady + 700, # hipErrorIllegalAddress + 701, # hipErrorLaunchOutOfResources + 702, # hipErrorLaunchTimeOut + 704, # hipErrorPeerAccessAlreadyEnabled + 705, # hipErrorPeerAccessNotEnabled + 708, # hipErrorSetOnActiveProcess + 709, # hipErrorContextIsDestroyed + 710, # hipErrorAssert + 712, # hipErrorHostMemoryAlreadyRegistered + 713, # hipErrorHostMemoryNotRegistered + 719, # hipErrorLaunchFailure + 720, # hipErrorCooperativeLaunchTooLarge + 801, # hipErrorNotSupported + 900, # hipErrorStreamCaptureUnsupported + 901, # hipErrorStreamCaptureInvalidated + 902, # hipErrorStreamCaptureMerge + 903, # hipErrorStreamCaptureUnmatched + 904, # hipErrorStreamCaptureUnjoined + 905, # hipErrorStreamCaptureIsolation + 906, # hipErrorStreamCaptureImplicit + 907, # hipErrorCapturedEvent + 908, # hipErrorStreamCaptureWrongThread + 910, # hipErrorGraphExecUpdateFailure + 911, # hipErrorInvalidChannelDescriptor + 912, # hipErrorInvalidTexture + 999, # hipErrorUnknown + 1052, # hipErrorRuntimeMemory + 1053, # hipErrorRuntimeOther ] ) @@ -296,7 +314,7 @@ def hipStreamSynchronize(ptr): _libhip.hipEventCreateWithFlags.restype = int _libhip.hipEventCreateWithFlags.argtypes = [ ctypes.POINTER(ctypes.c_void_p), - ctypes.c_int, + ctypes.c_uint, ] @@ -544,7 +562,7 @@ def hipFree(ptr): ] -def hipMallocPitch(pitch, rows, cols, elesize): +def hipMallocPitch(rows, cols, elesize): """ Allocate pitched device memory. @@ -553,8 +571,6 @@ def hipMallocPitch(pitch, rows, cols, elesize): Parameters ---------- - pitch : int - Pitch for allocation. rows : int Requested pitched allocation height. cols : int @@ -566,15 +582,18 @@ def hipMallocPitch(pitch, rows, cols, elesize): ------- ptr : ctypes pointer Pointer to allocated device memory. + pitch : int + Pitch in bytes chosen by the runtime. """ ptr = ctypes.c_void_p() + pitch = ctypes.c_size_t(0) status = _libhip.hipMallocPitch( - ctypes.byref(ptr), ctypes.c_size_t(pitch), cols * elesize, rows + ctypes.byref(ptr), ctypes.byref(pitch), cols * elesize, rows ) hipCheckStatus(status) - return ptr, pitch + return ptr, pitch.value _libhip.hipMemset.restype = ctypes.c_int @@ -744,7 +763,8 @@ def hipMemcpyAsync(dst, src, count, direction, stream): _libhip.hipMemGetInfo.restype = int -_libhip.hipMemGetInfo.argtypes = [ctypes.c_void_p, ctypes.c_void_p] +_libhip.hipMemGetInfo.argtypes = [ + ctypes.POINTER(ctypes.c_size_t), ctypes.POINTER(ctypes.c_size_t)] def hipMemGetInfo(): @@ -857,128 +877,123 @@ class hipDeviceArch(ctypes.Structure): ] +class hipUUID(ctypes.Structure): + _fields_ = [ + ("bytes", ctypes.c_char * 16), + ] + + class hipDeviceProperties(ctypes.Structure): + """Matches hipDeviceProp_tR0600 layout from hip_runtime_api.h""" _fields_ = [ - # Device name ("_name", ctypes.c_char * 256), - # Size of global memory region (in bytes) + ("uuid", hipUUID), + ("luid", ctypes.c_char * 8), + ("luidDeviceNodeMask", ctypes.c_uint), ("totalGlobalMem", ctypes.c_size_t), - # Size of shared memory region (in bytes). ("sharedMemPerBlock", ctypes.c_size_t), - # Registers per block. ("regsPerBlock", ctypes.c_int), - # Warp size. ("warpSize", ctypes.c_int), - # Max work items per work group or workgroup max size. + ("memPitch", ctypes.c_size_t), ("maxThreadsPerBlock", ctypes.c_int), - # Max number of threads in each dimension (XYZ) of a block. ("maxThreadsDim", ctypes.c_int * 3), - # Max grid dimensions (XYZ). ("maxGridSize", ctypes.c_int * 3), - # Max clock frequency of the multiProcessors in khz. ("clockRate", ctypes.c_int), - # Max global memory clock frequency in khz. - ("memoryClockRate", ctypes.c_int), - # Global memory bus width in bits. - ("memoryBusWidth", ctypes.c_int), - # Size of shared memory region (in bytes). ("totalConstMem", ctypes.c_size_t), - # Major compute capability. On HCC, this is an approximation and features may - # differ from CUDA CC. See the arch feature flags for portable ways to query - # feature caps. ("major", ctypes.c_int), - # Minor compute capability. On HCC, this is an approximation and features may - # differ from CUDA CC. See the arch feature flags for portable ways to query - # feature caps. ("minor", ctypes.c_int), - # Number of multi-processors (compute units). + ("textureAlignment", ctypes.c_size_t), + ("texturePitchAlignment", ctypes.c_size_t), + ("deviceOverlap", ctypes.c_int), ("multiProcessorCount", ctypes.c_int), - # L2 cache size. - ("l2CacheSize", ctypes.c_int), - # Maximum resident threads per multi-processor. - ("maxThreadsPerMultiProcessor", ctypes.c_int), - # Compute mode. + ("kernelExecTimeoutEnabled", ctypes.c_int), + ("integrated", ctypes.c_int), + ("canMapHostMemory", ctypes.c_int), ("computeMode", ctypes.c_int), - # Frequency in khz of the timer used by the device-side "clock*" - # instructions. New for HIP. - ("clockInstructionRate", ctypes.c_int), - # Architectural feature flags. New for HIP. - ("arch", hipDeviceArch), - # Device can possibly execute multiple kernels concurrently. + ("maxTexture1D", ctypes.c_int), + ("maxTexture1DMipmap", ctypes.c_int), + ("maxTexture1DLinear", ctypes.c_int), + ("maxTexture2D", ctypes.c_int * 2), + ("maxTexture2DMipmap", ctypes.c_int * 2), + ("maxTexture2DLinear", ctypes.c_int * 3), + ("maxTexture2DGather", ctypes.c_int * 2), + ("maxTexture3D", ctypes.c_int * 3), + ("maxTexture3DAlt", ctypes.c_int * 3), + ("maxTextureCubemap", ctypes.c_int), + ("maxTexture1DLayered", ctypes.c_int * 2), + ("maxTexture2DLayered", ctypes.c_int * 3), + ("maxTextureCubemapLayered", ctypes.c_int * 2), + ("maxSurface1D", ctypes.c_int), + ("maxSurface2D", ctypes.c_int * 2), + ("maxSurface3D", ctypes.c_int * 3), + ("maxSurface1DLayered", ctypes.c_int * 2), + ("maxSurface2DLayered", ctypes.c_int * 3), + ("maxSurfaceCubemap", ctypes.c_int), + ("maxSurfaceCubemapLayered", ctypes.c_int * 2), + ("surfaceAlignment", ctypes.c_size_t), ("concurrentKernels", ctypes.c_int), - # PCI Domain ID - ("pciDomainID", ctypes.c_int), - # PCI Bus ID. + ("ECCEnabled", ctypes.c_int), ("pciBusID", ctypes.c_int), - # PCI Device ID. ("pciDeviceID", ctypes.c_int), - # Maximum Shared Memory Per Multiprocessor. - ("maxSharedMemoryPerMultiProcessor", ctypes.c_size_t), - # 1 if device is on a multi-GPU board, 0 if not. + ("pciDomainID", ctypes.c_int), + ("tccDriver", ctypes.c_int), + ("asyncEngineCount", ctypes.c_int), + ("unifiedAddressing", ctypes.c_int), + ("memoryClockRate", ctypes.c_int), + ("memoryBusWidth", ctypes.c_int), + ("l2CacheSize", ctypes.c_int), + ("persistingL2CacheMaxSize", ctypes.c_int), + ("maxThreadsPerMultiProcessor", ctypes.c_int), + ("streamPrioritiesSupported", ctypes.c_int), + ("globalL1CacheSupported", ctypes.c_int), + ("localL1CacheSupported", ctypes.c_int), + ("sharedMemPerMultiprocessor", ctypes.c_size_t), + ("regsPerMultiprocessor", ctypes.c_int), + ("managedMemory", ctypes.c_int), ("isMultiGpuBoard", ctypes.c_int), - # Check whether HIP can map host memory - ("canMapHostMemory", ctypes.c_int), - # DEPRECATED: use gcnArchName instead - ("gcnArch", ctypes.c_int), - # AMD GCN Arch Name. - ("_gcnArchName", ctypes.c_char * 256), - # APU vs dGPU - ("integrated", ctypes.c_int), - # HIP device supports cooperative launch + ("multiGpuBoardGroupID", ctypes.c_int), + ("hostNativeAtomicSupported", ctypes.c_int), + ("singleToDoublePrecisionPerfRatio", ctypes.c_int), + ("pageableMemoryAccess", ctypes.c_int), + ("concurrentManagedAccess", ctypes.c_int), + ("computePreemptionSupported", ctypes.c_int), + ("canUseHostPointerForRegisteredMem", ctypes.c_int), ("cooperativeLaunch", ctypes.c_int), - # HIP device supports cooperative launch on multiple devices ("cooperativeMultiDeviceLaunch", ctypes.c_int), - # Maximum size for 1D textures bound to linear memory - ("maxTexture1DLinear", ctypes.c_int), - # Maximum number of elements in 1D images - ("maxTexture1D", ctypes.c_int), - # Maximum dimensions (width, height) of 2D images, in image elements - ("maxTexture2D", ctypes.c_int * 2), - # Maximum dimensions (width, height, depth) of 3D images, in image elements - ("maxTexture3D", ctypes.c_int * 3), - # Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register + ("sharedMemPerBlockOptin", ctypes.c_size_t), + ("pageableMemoryAccessUsesHostPageTables", ctypes.c_int), + ("directManagedMemAccessFromHost", ctypes.c_int), + ("maxBlocksPerMultiProcessor", ctypes.c_int), + ("accessPolicyMaxWindowSize", ctypes.c_int), + ("reservedSharedMemPerBlock", ctypes.c_size_t), + ("hostRegisterSupported", ctypes.c_int), + ("sparseHipArraySupported", ctypes.c_int), + ("hostRegisterReadOnlySupported", ctypes.c_int), + ("timelineSemaphoreInteropSupported", ctypes.c_int), + ("memoryPoolsSupported", ctypes.c_int), + ("gpuDirectRDMASupported", ctypes.c_int), + ("gpuDirectRDMAFlushWritesOptions", ctypes.c_uint), + ("gpuDirectRDMAWritesOrdering", ctypes.c_int), + ("memoryPoolSupportedHandleTypes", ctypes.c_uint), + ("deferredMappingHipArraySupported", ctypes.c_int), + ("ipcEventSupported", ctypes.c_int), + ("clusterLaunch", ctypes.c_int), + ("unifiedFunctionPointers", ctypes.c_int), + ("_reserved", ctypes.c_int * 63), + ("_hipReserved", ctypes.c_int * 32), + # HIP Only struct members + ("_gcnArchName", ctypes.c_char * 256), + ("maxSharedMemoryPerMultiProcessor", ctypes.c_size_t), + ("clockInstructionRate", ctypes.c_int), + ("arch", hipDeviceArch), ("hdpMemFlushCntl", POINTER(ctypes.c_uint)), - # Addres of HDP_REG_COHERENCY_FLUSH_CNTL register ("hdpRegFlushCntl", POINTER(ctypes.c_uint)), - # Maximum pitch in bytes allowed by memory copies - ("memPitch", ctypes.c_size_t), - # Alignment requirement for textures - ("textureAlignment", ctypes.c_size_t), - # Pitch alignment requirement for texture references bound to pitched memory - ("texturePitchAlignment", ctypes.c_size_t), - # Run time limit for kernels executed on the device - ("kernelExecTimeoutEnabled", ctypes.c_int), - # Device has ECC support enabled - ("ECCEnabled", ctypes.c_int), - # 1:If device is Tesla device using TCC driver, else 0 - ("tccDriver", ctypes.c_int), - # HIP device supports cooperative launch on multiple - # devices with unmatched functions ("cooperativeMultiDeviceUnmatchedFunc", ctypes.c_int), - # HIP device supports cooperative launch on multiple - # devices with unmatched grid dimensions ("cooperativeMultiDeviceUnmatchedGridDim", ctypes.c_int), - # HIP device supports cooperative launch on multiple - # devices with unmatched block dimensions ("cooperativeMultiDeviceUnmatchedBlockDim", ctypes.c_int), - # HIP device supports cooperative launch on multiple - # devices with unmatched shared memories ("cooperativeMultiDeviceUnmatchedSharedMem", ctypes.c_int), - # 1: if it is a large PCI bar device, else 0 ("isLargeBar", ctypes.c_int), - # Revision of the GPU in this device ("asicRevision", ctypes.c_int), - # Device supports allocating managed memory on this system - ("managedMemory", ctypes.c_int), - # Host can directly access managed memory on the device without migration - ("directManagedMemAccessFromHost", ctypes.c_int), - # Device can coherently access managed memory concurrently with the CPU - ("concurrentManagedAccess", ctypes.c_int), - # Device supports coherently accessing pageable memory - # without calling hipHostRegister on it - ("pageableMemoryAccess", ctypes.c_int), - # Device accesses pageable memory via the host's page tables - ("pageableMemoryAccessUsesHostPageTables", ctypes.c_int), ] @property @@ -990,8 +1005,8 @@ def gcnArchName(self): return self._gcnArchName.decode("utf-8") -_libhip.hipGetDeviceProperties.restype = int -_libhip.hipGetDeviceProperties.argtypes = [ +_libhip.hipGetDevicePropertiesR0600.restype = int +_libhip.hipGetDevicePropertiesR0600.argtypes = [ POINTER(hipDeviceProperties), ctypes.c_int] @@ -1009,30 +1024,35 @@ def hipGetDeviceProperties(deviceId: int): Information for the specified device """ device_properties = hipDeviceProperties() - status = _libhip.hipGetDeviceProperties( + status = _libhip.hipGetDevicePropertiesR0600( ctypes.pointer(device_properties), deviceId) hipCheckStatus(status) return device_properties -# Memory types: +# Memory types (hipMemoryType enum): +hipMemoryTypeUnregistered = 0 hipMemoryTypeHost = 1 hipMemoryTypeDevice = 2 -hipMemoryTypeArray = 3 -hipMemoryTypeUnified = 4 # Not used currently +hipMemoryTypeManaged = 3 +hipMemoryTypeArray = 10 +hipMemoryTypeUnified = 11 class hipPointerAttributes(ctypes.Structure): _fields_ = [ - ("memoryType", ctypes.c_int), + ("type", ctypes.c_int), ("device", ctypes.c_int), ("devicePointer", ctypes.c_void_p), ("hostPointer", ctypes.c_void_p), + ("isManaged", ctypes.c_int), + ("allocationFlags", ctypes.c_uint), ] _libhip.hipPointerGetAttributes.restype = int -_libhip.hipPointerGetAttributes.argtypes = [ctypes.c_void_p, ctypes.c_void_p] +_libhip.hipPointerGetAttributes.argtypes = [ + POINTER(hipPointerAttributes), ctypes.c_void_p] def hipPointerGetAttributes(ptr): @@ -1058,7 +1078,7 @@ def hipPointerGetAttributes(ptr): attributes = hipPointerAttributes() status = _libhip.hipPointerGetAttributes(ctypes.byref(attributes), ptr) hipCheckStatus(status) - return attributes.memoryType, attributes.device + return attributes.type, attributes.device _libhip.hipModuleLoadData.restype = int @@ -1200,7 +1220,9 @@ def hipModuleUnload(module): ] # extra -def hipModuleLaunchKernel(kernel, bx, by, bz, tx, ty, tz, shared, stream, struct): +def hipModuleLaunchKernel(kernel, gridDimX, gridDimY, gridDimZ, + blockDimX, blockDimY, blockDimZ, + sharedMemBytes, stream, struct): """ Launch the kernel @@ -1208,34 +1230,33 @@ def hipModuleLaunchKernel(kernel, bx, by, bz, tx, ty, tz, shared, stream, struct ---------- kernel : ctypes ptr kernel from loaded module - bx : int - dim x - by : int - dim y - bz : int - dim z - tx : int - dim x - ty : int - dim y - tz : int - dim z - shared : int - shared mem + gridDimX : int + grid dimension x + gridDimY : int + grid dimension y + gridDimZ : int + grid dimension z + blockDimX : int + block dimension x + blockDimY : int + block dimension y + blockDimZ : int + block dimension z + sharedMemBytes : int + shared memory in bytes stream : ctype void ptr stream object struct : ctypes structure struct of packed up arguments of kernel """ - c_bx = ctypes.c_uint(bx) - c_by = ctypes.c_uint(by) - c_bz = ctypes.c_uint(bz) - c_tx = ctypes.c_uint(tx) - c_ty = ctypes.c_uint(ty) - c_tz = ctypes.c_uint(tz) - c_shared = ctypes.c_uint(shared) + c_gridDimX = ctypes.c_uint(gridDimX) + c_gridDimY = ctypes.c_uint(gridDimY) + c_gridDimZ = ctypes.c_uint(gridDimZ) + c_blockDimX = ctypes.c_uint(blockDimX) + c_blockDimY = ctypes.c_uint(blockDimY) + c_blockDimZ = ctypes.c_uint(blockDimZ) + c_sharedMemBytes = ctypes.c_uint(sharedMemBytes) - ctypes.sizeof(struct) hip_launch_param_buffer_ptr = ctypes.c_void_p(1) hip_launch_param_buffer_size = ctypes.c_void_p(2) hip_launch_param_buffer_end = ctypes.c_void_p(0) @@ -1251,10 +1272,11 @@ def hipModuleLaunchKernel(kernel, bx, by, bz, tx, ty, tz, shared, stream, struct p_size, hip_launch_param_buffer_end, ) - nullptr = ctypes.POINTER(ctypes.c_void_p)(ctypes.c_void_p(0)) status = _libhip.hipModuleLaunchKernel( - kernel, c_bx, c_by, c_bz, c_tx, c_ty, c_tz, c_shared, stream, None, config + kernel, c_gridDimX, c_gridDimY, c_gridDimZ, + c_blockDimX, c_blockDimY, c_blockDimZ, + c_sharedMemBytes, stream, None, config ) hipCheckStatus(status) diff --git a/pyhip/hiprtc.py b/pyhip/hiprtc.py index 97782f2..22435ed 100644 --- a/pyhip/hiprtc.py +++ b/pyhip/hiprtc.py @@ -77,32 +77,37 @@ def hiprtcGetErrorString(e): """ - return _libhiprtc.hiprtcGetErrorString(e) + error = _libhiprtc.hiprtcGetErrorString(e) + return str(error) # Generic hiprtc Error class hiprtcError(Exception): - """hip error""" + """hiprtc error""" def __init__(self, error=0) -> None: - super().__init__(hiprtcGetErrorString(error)) + if isinstance(error, int): + super().__init__(hiprtcGetErrorString(error)) + else: + super().__init__(str(error)) knownExceptions = set( [ - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - 10, - 11, + 1, # HIPRTC_ERROR_OUT_OF_MEMORY + 2, # HIPRTC_ERROR_PROGRAM_CREATION_FAILURE + 3, # HIPRTC_ERROR_INVALID_INPUT + 4, # HIPRTC_ERROR_INVALID_PROGRAM + 5, # HIPRTC_ERROR_INVALID_OPTION + 6, # HIPRTC_ERROR_COMPILATION + 7, # HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE + 8, # HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION + 9, # HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION + 10, # HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID + 11, # HIPRTC_ERROR_INTERNAL_ERROR + 100, # HIPRTC_ERROR_LINKING ] ) @@ -294,11 +299,10 @@ def hiprtcGetProgramLog(prog): status = _libhiprtc.hiprtcGetProgramLogSize(prog, ctypes.byref(log_size)) hiprtcCheckStatus(status) - log = "0" * log_size.value - e_log = log.encode("utf-8") - status = _libhiprtc.hiprtcGetProgramLog(prog, e_log) + log_buf = ctypes.create_string_buffer(log_size.value) + status = _libhiprtc.hiprtcGetProgramLog(prog, log_buf) hiprtcCheckStatus(status) - return e_log.decode("utf-8") + return log_buf.value.decode("utf-8") _libhiprtc.hiprtcGetCodeSize.restype = int @@ -324,15 +328,109 @@ def hiprtcGetCode(prog): Returns ------- - code : string - hiprtc module code + code : bytes + hiprtc compiled binary code """ code_size = ctypes.c_size_t() status = _libhiprtc.hiprtcGetCodeSize(prog, ctypes.byref(code_size)) hiprtcCheckStatus(status) - code = "0" * code_size.value - e_code = code.encode("utf-8") - status = _libhiprtc.hiprtcGetCode(prog, e_code) + code_buf = ctypes.create_string_buffer(code_size.value) + status = _libhiprtc.hiprtcGetCode(prog, code_buf) + hiprtcCheckStatus(status) + return code_buf.raw + + +_libhiprtc.hiprtcVersion.restype = int +_libhiprtc.hiprtcVersion.argtypes = [ + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.c_int), +] + + +def hiprtcVersion(): + """ + Returns the hiprtc major and minor version. + + Returns + ------- + major : int + HIP Runtime Compilation major version. + minor : int + HIP Runtime Compilation minor version. + """ + major = ctypes.c_int(0) + minor = ctypes.c_int(0) + status = _libhiprtc.hiprtcVersion( + ctypes.byref(major), ctypes.byref(minor)) + hiprtcCheckStatus(status) + return major.value, minor.value + + +_libhiprtc.hiprtcGetLoweredName.restype = int +_libhiprtc.hiprtcGetLoweredName.argtypes = [ + ctypes.c_void_p, # hiprtcProgram + ctypes.POINTER(ctypes.c_char), # name_expression + ctypes.POINTER(ctypes.c_char_p), # lowered_name +] + + +def hiprtcGetLoweredName(prog, name_expression): + """ + Gets the lowered (mangled) name from an instance of hiprtcProgram. + + Parameters + ---------- + prog : ctypes pointer + hiprtc program handle + name_expression : str + The name expression to look up. + + Returns + ------- + lowered_name : str + The lowered (mangled) name. + """ + e_name = name_expression.encode("utf-8") + lowered_name = ctypes.c_char_p() + status = _libhiprtc.hiprtcGetLoweredName( + prog, e_name, ctypes.byref(lowered_name)) + hiprtcCheckStatus(status) + return lowered_name.value.decode("utf-8") + + +_libhiprtc.hiprtcGetBitcodeSize.restype = int +_libhiprtc.hiprtcGetBitcodeSize.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_size_t), +] +_libhiprtc.hiprtcGetBitcode.restype = int +_libhiprtc.hiprtcGetBitcode.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(ctypes.c_char), +] + + +def hiprtcGetBitcode(prog): + """ + Gets the compiled bitcode from the program. + + Parameters + ---------- + prog : ctypes pointer + hiprtc program handle + + Returns + ------- + bitcode : bytes + Compiled bitcode. + """ + bitcode_size = ctypes.c_size_t() + status = _libhiprtc.hiprtcGetBitcodeSize( + prog, ctypes.byref(bitcode_size)) + hiprtcCheckStatus(status) + + bitcode_buf = ctypes.create_string_buffer(bitcode_size.value) + status = _libhiprtc.hiprtcGetBitcode(prog, bitcode_buf) hiprtcCheckStatus(status) - return e_code + return bitcode_buf.raw diff --git a/tests/test_device_management.py b/tests/test_device_management.py new file mode 100644 index 0000000..98bcc97 --- /dev/null +++ b/tests/test_device_management.py @@ -0,0 +1,38 @@ +from pyhip import hip +import unittest + + +class TestDeviceManagement(unittest.TestCase): + def test_hipInit(self): + hip.hipInit(0) + + def test_hipGetDevice(self): + device = hip.hipGetDevice() + self.assertGreaterEqual(device, 0) + + def test_hipSetDevice(self): + device_count = hip.hipGetDeviceCount() + self.assertGreater(device_count, 0) + original_device = hip.hipGetDevice() + for i in range(device_count): + hip.hipSetDevice(i) + current = hip.hipGetDevice() + self.assertEqual(current, i) + hip.hipSetDevice(original_device) + + def test_hipSetDevice_invalid(self): + device_count = hip.hipGetDeviceCount() + with self.assertRaises(hip.hipError): + hip.hipSetDevice(device_count + 100) + + def test_hipDeviceSynchronize(self): + hip.hipDeviceSynchronize() + + def test_hipDeviceSynchronize_after_malloc(self): + ptr = hip.hipMalloc(1024) + hip.hipDeviceSynchronize() + hip.hipFree(ptr) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_device_properties.py b/tests/test_device_properties.py new file mode 100644 index 0000000..3dc26ce --- /dev/null +++ b/tests/test_device_properties.py @@ -0,0 +1,108 @@ +from pyhip import hip +import unittest + + +class TestDeviceProperties(unittest.TestCase): + def setUp(self): + self.device_count = hip.hipGetDeviceCount() + self.assertGreater(self.device_count, 0) + + def test_name_not_empty(self): + props = hip.hipGetDeviceProperties(0) + self.assertIsInstance(props.name, str) + self.assertGreater(len(props.name), 0) + + def test_total_global_mem(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.totalGlobalMem, 0) + + def test_shared_mem_per_block(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.sharedMemPerBlock, 0) + + def test_warp_size(self): + props = hip.hipGetDeviceProperties(0) + self.assertIn(props.warpSize, [32, 64]) + + def test_max_threads_per_block(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.maxThreadsPerBlock, 0) + self.assertLessEqual(props.maxThreadsPerBlock, 2048) + + def test_max_threads_dim(self): + props = hip.hipGetDeviceProperties(0) + for i in range(3): + self.assertGreater(props.maxThreadsDim[i], 0) + + def test_max_grid_size(self): + props = hip.hipGetDeviceProperties(0) + for i in range(3): + self.assertGreater(props.maxGridSize[i], 0) + + def test_clock_rate(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.clockRate, 0) + + def test_compute_capability(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.major, 0) + self.assertGreaterEqual(props.minor, 0) + + def test_multi_processor_count(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.multiProcessorCount, 0) + + def test_l2_cache_size(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreaterEqual(props.l2CacheSize, 0) + + def test_max_threads_per_multi_processor(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.maxThreadsPerMultiProcessor, 0) + + def test_memory_clock_rate(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.memoryClockRate, 0) + + def test_memory_bus_width(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.memoryBusWidth, 0) + + def test_regs_per_block(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.regsPerBlock, 0) + + def test_gcn_arch_name_on_amd(self): + if hip.hipGetPlatformName() != "amd": + self.skipTest("gcnArchName is AMD-specific") + props = hip.hipGetDeviceProperties(0) + self.assertIsInstance(props.gcnArchName, str) + self.assertGreater(len(props.gcnArchName), 0) + + def test_concurrent_kernels(self): + props = hip.hipGetDeviceProperties(0) + self.assertIn(props.concurrentKernels, [0, 1]) + + def test_all_devices(self): + for i in range(self.device_count): + props = hip.hipGetDeviceProperties(i) + self.assertGreater(len(props.name), 0) + self.assertGreater(props.totalGlobalMem, 0) + + def test_pci_ids(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreaterEqual(props.pciBusID, 0) + self.assertGreaterEqual(props.pciDeviceID, 0) + self.assertGreaterEqual(props.pciDomainID, 0) + + def test_mem_pitch(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.memPitch, 0) + + def test_texture_alignment(self): + props = hip.hipGetDeviceProperties(0) + self.assertGreater(props.textureAlignment, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_error_handling.py b/tests/test_error_handling.py new file mode 100644 index 0000000..705b72c --- /dev/null +++ b/tests/test_error_handling.py @@ -0,0 +1,53 @@ +from pyhip import hip +import ctypes +import unittest + + +class TestErrorHandling(unittest.TestCase): + def test_hipError_with_known_code(self): + try: + raise hip.hipError(1) + except hip.hipError as e: + msg = str(e) + self.assertGreater(len(msg), 0) + + def test_hipError_with_string(self): + try: + raise hip.hipError("custom error message") + except hip.hipError as e: + self.assertEqual(str(e), "custom error message") + + def test_hipCheckStatus_success(self): + hip.hipCheckStatus(0) + + def test_hipCheckStatus_known_error(self): + with self.assertRaises(hip.hipError): + hip.hipCheckStatus(1) + + def test_hipCheckStatus_unknown_error(self): + with self.assertRaises(hip.hipError) as ctx: + hip.hipCheckStatus(99999) + self.assertIn("unknown hip error", str(ctx.exception)) + + def test_hipGetErrorString(self): + s = hip.hipGetErrorString(0) + self.assertIsInstance(s, str) + self.assertGreater(len(s), 0) + + def test_hipGetErrorName(self): + name = hip.hipGetErrorName(0) + self.assertIsInstance(name, str) + self.assertGreater(len(name), 0) + + def test_hipFree_invalid_raises(self): + bad_ptr = ctypes.c_void_p(0xDEADBEEF) + with self.assertRaises(hip.hipError): + hip.hipFree(bad_ptr) + + def test_hipMalloc_zero_bytes(self): + ptr = hip.hipMalloc(0) + hip.hipFree(ptr) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_event_flags.py b/tests/test_event_flags.py new file mode 100644 index 0000000..3d2bfbd --- /dev/null +++ b/tests/test_event_flags.py @@ -0,0 +1,47 @@ +from pyhip import hip +import unittest + + +class TestEventFlags(unittest.TestCase): + def test_hipEventCreateWithFlags_default(self): + event = hip.hipEventCreateWithFlags(hip.hipEventDefault) + self.assertIsNotNone(event) + hip.hipEventDestroy(event) + + def test_hipEventCreateWithFlags_blocking_sync(self): + event = hip.hipEventCreateWithFlags(hip.hipEventBlockingSync) + self.assertIsNotNone(event) + hip.hipEventDestroy(event) + + def test_hipEventCreateWithFlags_disable_timing(self): + event = hip.hipEventCreateWithFlags(hip.hipEventDisableTiming) + self.assertIsNotNone(event) + hip.hipEventDestroy(event) + + def test_event_with_flags_record_sync(self): + stream = hip.hipStreamCreate() + event = hip.hipEventCreateWithFlags(hip.hipEventBlockingSync) + hip.hipEventRecord(event, stream) + hip.hipEventSynchronize(event) + self.assertTrue(hip.hipEventQuery(event)) + hip.hipEventDestroy(event) + hip.hipStreamDestroy(stream) + + def test_event_disable_timing_no_elapsed(self): + stream = hip.hipStreamCreate() + start = hip.hipEventCreateWithFlags(hip.hipEventDisableTiming) + end = hip.hipEventCreateWithFlags(hip.hipEventDisableTiming) + hip.hipEventRecord(start, stream) + ptr = hip.hipMalloc(1024) + hip.hipEventRecord(end, stream) + hip.hipEventSynchronize(end) + with self.assertRaises(hip.hipError): + hip.hipEventElapsedTime(start, end) + hip.hipFree(ptr) + hip.hipEventDestroy(start) + hip.hipEventDestroy(end) + hip.hipStreamDestroy(stream) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_memory_advanced.py b/tests/test_memory_advanced.py new file mode 100644 index 0000000..6db1ce4 --- /dev/null +++ b/tests/test_memory_advanced.py @@ -0,0 +1,53 @@ +from pyhip import hip +import ctypes +import unittest + + +class TestMemGetInfo(unittest.TestCase): + def test_hipMemGetInfo(self): + free, total = hip.hipMemGetInfo() + self.assertGreater(total, 0) + self.assertGreater(free, 0) + self.assertGreaterEqual(total, free) + + def test_hipMemGetInfo_after_alloc(self): + free_before, total_before = hip.hipMemGetInfo() + alloc_size = 256 * 1024 * 1024 # 256 MB + ptr = hip.hipMalloc(alloc_size) + free_after, total_after = hip.hipMemGetInfo() + self.assertEqual(total_before, total_after) + self.assertGreaterEqual(free_before, free_after) + hip.hipFree(ptr) + + +class TestMallocPitch(unittest.TestCase): + def test_hipMallocPitch_basic(self): + rows = 64 + cols = 64 + elesize = 4 # sizeof(int) + ptr, pitch = hip.hipMallocPitch(rows, cols, elesize) + self.assertIsNotNone(ptr) + self.assertGreaterEqual(pitch, cols * elesize) + hip.hipFree(ptr) + + def test_hipMallocPitch_returns_aligned_pitch(self): + rows = 100 + cols = 100 + elesize = 4 + ptr, pitch = hip.hipMallocPitch(rows, cols, elesize) + self.assertIsNotNone(ptr) + self.assertEqual(pitch % 128, 0, "pitch should be aligned to 128 bytes") + hip.hipFree(ptr) + + +class TestPointerAttributes(unittest.TestCase): + def test_device_pointer_attributes(self): + ptr = hip.hipMalloc(1024) + mem_type, device = hip.hipPointerGetAttributes(ptr) + self.assertEqual(mem_type, hip.hipMemoryTypeDevice) + self.assertGreaterEqual(device, 0) + hip.hipFree(ptr) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_module.py b/tests/test_module.py new file mode 100644 index 0000000..dccd685 --- /dev/null +++ b/tests/test_module.py @@ -0,0 +1,147 @@ +from pyhip import hip, hiprtc +import ctypes +import unittest + + +class TestModuleLifecycle(unittest.TestCase): + @classmethod + def _compile_kernel(cls): + source = """ + extern "C" __global__ void add_one(int *a, int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + a[i] = a[i] + 1; + } + } + """ + prog = hiprtc.hiprtcCreateProgram(source, "add_one", [], []) + device_properties = hip.hipGetDeviceProperties(0) + if hip.hipGetPlatformName() == "amd": + hiprtc.hiprtcCompileProgram( + prog, [f"--offload-arch={device_properties.gcnArchName}"] + ) + else: + hiprtc.hiprtcCompileProgram(prog, []) + code = hiprtc.hiprtcGetCode(prog) + hiprtc.hiprtcDestroyProgram(prog) + return code + + def test_module_load_unload(self): + code = self._compile_kernel() + module = hip.hipModuleLoadData(code) + self.assertIsNotNone(module) + hip.hipModuleUnload(module) + + def test_module_get_function(self): + code = self._compile_kernel() + module = hip.hipModuleLoadData(code) + kernel = hip.hipModuleGetFunction(module, "add_one") + self.assertIsNotNone(kernel) + hip.hipModuleUnload(module) + + def test_module_get_function_invalid(self): + code = self._compile_kernel() + module = hip.hipModuleLoadData(code) + with self.assertRaises(hip.hipError): + hip.hipModuleGetFunction(module, "nonexistent_kernel") + hip.hipModuleUnload(module) + + def test_kernel_launch_and_verify(self): + code = self._compile_kernel() + module = hip.hipModuleLoadData(code) + kernel = hip.hipModuleGetFunction(module, "add_one") + + n = 64 + size = n * ctypes.sizeof(ctypes.c_int) + h_input = (ctypes.c_int * n)() + for i in range(n): + h_input[i] = i + + d_ptr = hip.hipMalloc(size) + hip.hipMemcpy_htod(d_ptr, ctypes.byref(h_input), size) + + class Args(ctypes.Structure): + _fields_ = [("a", ctypes.c_void_p), ("n", ctypes.c_int)] + + args = Args(d_ptr, n) + hip.hipModuleLaunchKernel(kernel, 1, 1, 1, n, 1, 1, 0, 0, args) + hip.hipDeviceSynchronize() + + h_output = (ctypes.c_int * n)() + hip.hipMemcpy_dtoh(ctypes.byref(h_output), d_ptr, size) + + for i in range(n): + self.assertEqual(h_output[i], i + 1) + + hip.hipFree(d_ptr) + hip.hipModuleUnload(module) + + def test_kernel_launch_multiple_blocks(self): + code = self._compile_kernel() + module = hip.hipModuleLoadData(code) + kernel = hip.hipModuleGetFunction(module, "add_one") + + n = 1024 + size = n * ctypes.sizeof(ctypes.c_int) + h_input = (ctypes.c_int * n)() + for i in range(n): + h_input[i] = i * 2 + + d_ptr = hip.hipMalloc(size) + hip.hipMemcpy_htod(d_ptr, ctypes.byref(h_input), size) + + class Args(ctypes.Structure): + _fields_ = [("a", ctypes.c_void_p), ("n", ctypes.c_int)] + + args = Args(d_ptr, n) + block_size = 256 + grid_size = (n + block_size - 1) // block_size + hip.hipModuleLaunchKernel( + kernel, grid_size, 1, 1, block_size, 1, 1, 0, 0, args + ) + hip.hipDeviceSynchronize() + + h_output = (ctypes.c_int * n)() + hip.hipMemcpy_dtoh(ctypes.byref(h_output), d_ptr, size) + + for i in range(n): + self.assertEqual(h_output[i], i * 2 + 1) + + hip.hipFree(d_ptr) + hip.hipModuleUnload(module) + + def test_kernel_launch_on_stream(self): + code = self._compile_kernel() + module = hip.hipModuleLoadData(code) + kernel = hip.hipModuleGetFunction(module, "add_one") + stream = hip.hipStreamCreate() + + n = 128 + size = n * ctypes.sizeof(ctypes.c_int) + h_input = (ctypes.c_int * n)() + for i in range(n): + h_input[i] = 100 + + d_ptr = hip.hipMalloc(size) + hip.hipMemcpy_htod(d_ptr, ctypes.byref(h_input), size) + + class Args(ctypes.Structure): + _fields_ = [("a", ctypes.c_void_p), ("n", ctypes.c_int)] + + args = Args(d_ptr, n) + hip.hipModuleLaunchKernel(kernel, 1, 1, 1, n, 1, 1, 0, stream, args) + hip.hipStreamSynchronize(stream) + + h_output = (ctypes.c_int * n)() + hip.hipMemcpy_dtoh(ctypes.byref(h_output), d_ptr, size) + + for i in range(n): + self.assertEqual(h_output[i], 101) + + hip.hipFree(d_ptr) + hip.hipStreamDestroy(stream) + hip.hipModuleUnload(module) + + +if __name__ == "__main__": + unittest.main()