From 1542d442dc0975157db0c32a3fd40a58492164e9 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 15 Oct 2025 02:31:55 +0000 Subject: [PATCH 1/8] Support i1 and callable Signed-off-by: Thien Nguyen --- python/cudaq/kernel/ast_bridge.py | 167 ++++++++++++++++-- python/cudaq/kernel/kernel_decorator.py | 67 +++++-- python/cudaq/kernel/utils.py | 40 ++++- python/runtime/cudaq/algorithms/py_run.cpp | 42 +++-- .../cudaq/platform/py_alt_launch_kernel.cpp | 22 ++- python/tests/kernel/test_return_vectors.py | 132 ++++++++++++++ 6 files changed, 419 insertions(+), 51 deletions(-) create mode 100644 python/tests/kernel/test_return_vectors.py diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index aceab7ec75a..39c64b2274f 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -434,6 +434,26 @@ def changeOperandToType(self, ty, operand, allowDemotion=False): operand, sint=operand_width != 1, zint=operand_width == 1).result + + if quake.StruqType.isinstance(ty): + if quake.StruqType.isinstance(operand.type): + # Check that the struct types are the same, only the name may differ. + targetMemberType = quake.StruqType.getTypes(ty) + operandMemberType = quake.StruqType.getTypes(operand.type) + if len(targetMemberType) != len(operandMemberType): + self.emitFatalError( + f'cannot convert value of type {operand.type} to the requested type {ty}', + self.currentNode) + for i in range(len(targetMemberType)): + if targetMemberType[i] != operandMemberType[i]: + self.emitFatalError( + f'cannot convert value of type {operand.type} to the requested type {ty}', + self.currentNode) + # It is the same struct, do a cast + structPtr = self.ifNotPointerThenStore(operand) + castedPtr = cc.CastOp(cc.PointerType.get(ty), structPtr).result + return self.ifPointerThenLoad(castedPtr) + self.emitFatalError( f'cannot convert value of type {operand.type} to the requested type {ty}', self.currentNode) @@ -578,6 +598,7 @@ def ifNotPointerThenStore(self, value): if not cc.PointerType.isinstance(value.type): slot = cc.AllocaOp(cc.PointerType.get(value.type), TypeAttr.get(value.type)).result + assert cc.PointerType.get(value.type) == slot.type cc.StoreOp(value, slot) return slot return value @@ -585,20 +606,32 @@ def ifNotPointerThenStore(self, value): def __createStdvecWithKnownValues(self, size, listElementValues): # Turn this List into a StdVec arrSize = self.getConstantInt(size) - arrTy = cc.ArrayType.get(listElementValues[0].type) + elemTy = listElementValues[0].type + # If this is an `i1`, turns it into an `i8` array. + isBool = elemTy == self.getIntegerType(1) + if isBool: + elemTy = self.getIntegerType(8) + + arrTy = cc.ArrayType.get(elemTy) alloca = cc.AllocaOp(cc.PointerType.get(arrTy), - TypeAttr.get(listElementValues[0].type), + TypeAttr.get(elemTy), seqSize=arrSize).result for i, v in enumerate(listElementValues): eleAddr = cc.ComputePtrOp( - cc.PointerType.get(listElementValues[0].type), alloca, + cc.PointerType.get(elemTy), alloca, [self.getConstantInt(i)], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)).result + if isBool: + # Cast the list value before assigning + v = self.changeOperandToType(self.getIntegerType(8), v) cc.StoreOp(v, eleAddr) - vecTy = listElementValues[0].type + # Create the `StdVec` from the alloca + # We still use `i1` as the vector element type if the + # original list was of booleans. + vecTy = elemTy if not isBool else self.getIntegerType(1) if cc.PointerType.isinstance(vecTy): vecTy = cc.PointerType.getElementType(vecTy) @@ -655,6 +688,10 @@ def __copyVectorAndCastElements(self, if (sourceEleType == targetEleType): return sourcePtr + isSourceBool = sourceEleType == self.getIntegerType(1) + if isSourceBool: + sourceEleType = self.getIntegerType(8) + sourceArrType = cc.ArrayType.get(sourceEleType) sourceElePtrTy = cc.PointerType.get(sourceEleType) sourceArrElePtrTy = cc.PointerType.get(sourceArrType) @@ -662,10 +699,16 @@ def __copyVectorAndCastElements(self, sourceDataPtr = cc.StdvecDataOp(sourceArrElePtrTy, sourceValue).result sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result + isTargetBool = targetEleType == self.getIntegerType(1) + # Vector type reflects the true type, including `i1` + targetVecTy = cc.StdvecType.get(targetEleType) + + if isTargetBool: + targetEleType = self.getIntegerType(8) + targetElePtrType = cc.PointerType.get(targetEleType) targetTy = cc.ArrayType.get(targetEleType) targetArrElePtrTy = cc.PointerType.get(targetTy) - targetVecTy = cc.StdvecType.get(targetEleType) targetPtr = cc.AllocaOp(targetArrElePtrTy, TypeAttr.get(targetEleType), seqSize=sourceSize).result @@ -681,6 +724,7 @@ def bodyBuilder(iterVar): allowDemotion=allowDemotion) targetEleAddr = cc.ComputePtrOp(targetElePtrType, targetPtr, [iterVar], rawIndex).result + assert cc.PointerType.get(targetEleType) == targetEleAddr.type cc.StoreOp(castedEle, targetEleAddr) self.createInvariantForLoop(sourceSize, bodyBuilder) @@ -777,15 +821,26 @@ def __load_vector_element(self, vector, index): MLIR Value containing the loaded element """ if cc.StdvecType.isinstance(vector.type): + elem_ty = cc.StdvecType.getElementType(vector.type) + is_bool = elem_ty == self.getIntegerType(1) + # std::vector is a special case in C++ where each element + # is stored as a single bit, but the underlying array is actually + # an array of `i8` values. + if is_bool: + # `i1` elements are stored as `i8` in the underlying array. + elem_ty = self.getIntegerType(8) data_ptr = cc.StdvecDataOp( cc.PointerType.get( - cc.ArrayType.get(cc.StdvecType.getElementType( - vector.type))), vector).result - return cc.LoadOp( + cc.ArrayType.get(elem_ty)), vector).result + load_val = cc.LoadOp( cc.ComputePtrOp( - cc.PointerType.get(cc.StdvecType.getElementType( - vector.type)), data_ptr, [index], + cc.PointerType.get(elem_ty), data_ptr, [index], DenseI32ArrayAttr.get([kDynamicPtrIndex]))).result + if is_bool: + # Cast back to `i1` if the original vector element type was `i1`. + load_val = self.changeOperandToType(self.getIntegerType(1), + load_val) + return load_val return cc.LoadOp( cc.ComputePtrOp( cc.PointerType.get( @@ -1405,6 +1460,7 @@ def process_assignment(target, value): # We should allocate and store alloca = cc.AllocaOp(cc.PointerType.get(value.type), TypeAttr.get(value.type)).result + assert cc.PointerType.get(value.type) == alloca.type cc.StoreOp(value, alloca) return target, alloca @@ -1438,6 +1494,8 @@ def process_assignment(target, value): # Visit the value being assigned self.visit(node.value) valueToStore = self.popValue() + # Cast if necessary + valueToStore = self.changeOperandToType(ptrEleType, valueToStore) # Store the value cc.StoreOp(valueToStore, ptrVal) return target.value, None @@ -1460,6 +1518,8 @@ def process_assignment(target, value): # Visit the value being assigned self.visit(node.value) valueToStore = self.popValue() + # Cast if necessary + valueToStore = self.changeOperandToType(cc.PointerType.getElementType(ptrVal.type), valueToStore) # Store the value cc.StoreOp(valueToStore, ptrVal) return target.value, None @@ -1771,6 +1831,26 @@ def processFunctionCall(fType, nrValsToPop): func.CallOp(otherKernel, values) else: result = func.CallOp(otherKernel, values).result + # Copy to stack if necessary + if cc.StdvecType.isinstance(result.type): + elemTy = cc.StdvecType.getElementType(result.type) + if elemTy == self.getIntegerType(1): + elemTy = self.getIntegerType(8) + data = cc.StdvecDataOp(cc.PointerType.get(elemTy), result).result + i64Ty = self.getIntegerType(64) + length = cc.StdvecSizeOp(i64Ty, result).result + elemSize = cc.SizeOfOp(i64Ty, TypeAttr.get(elemTy)).result + buffer = cc.AllocaOp(cc.PointerType.get(cc.ArrayType.get(elemTy)), TypeAttr.get(elemTy), seqSize=length).result + i8PtrTy = cc.PointerType.get(self.getIntegerType(8)) + cbuffer = cc.CastOp(i8PtrTy, buffer).result + cdata = cc.CastOp(i8PtrTy, data).result + symName = '__nvqpp_vectorCopyToStack' + load_intrinsic(self.module, symName) + sizeInBytes = arith.MulIOp(length, elemSize).result + func.CallOp([], symName, [cbuffer, cdata, sizeInBytes]) + # Replace result with the stack buffer-backed vector + result = cc.StdvecInitOp(result.type, buffer, length=length).result + self.pushValue(result) def checkControlAndTargetTypes(controls, targets): @@ -2474,7 +2554,18 @@ def bodyBuilder(iterVal): cc.StoreOp(ctorArgs[i], eleAddr) self.pushValue(stackSlot) return - + # Check generic callable objects that may be C++ kernels + elif hasattr(var, '__call__') and hasattr(var, '__module__') and hasattr(var, '__name__'): + # This is a callable object, likely a C++ kernel + devKey = f"{var.__module__}.{var.__name__}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( + self.module, devKey) + if maybeKernelName != None: + otherKernel = SymbolTable( + self.module.operation)[maybeKernelName] + processFunctionCall(otherKernel.type, len(node.args)) + return else: self.emitFatalError( "unhandled function call - {}, known kernels are {}".format( @@ -2915,6 +3006,30 @@ def bodyBuilder(iterVal): quake.ComputeActionOp(compute, action) return + if node.func.attr == 'to_integer': + boolVec = self.popValue() + boolVec = self.ifPointerThenLoad(boolVec) + if not cc.StdvecType.isinstance(boolVec.type): + self.emitFatalError( + "to_integer expects a vector of booleans. Got type {}".format( + boolVec.type), + node) + elemTy = cc.StdvecType.getElementType(boolVec.type) + if elemTy != self.getIntegerType(1): + self.emitFatalError( + "to_integer expects a vector of booleans. Got type {}".format( + boolVec.type), + node) + cudaqConvertToInteger = "__nvqpp_cudaqConvertToInteger" + # Load the intrinsic + load_intrinsic(self.module, cudaqConvertToInteger) + # Signature: + # `func.func private @__nvqpp_cudaqConvertToInteger(%arg : !cc.stdvec) -> i64` + resultTy = self.getIntegerType(64) + result = func.CallOp([resultTy], cudaqConvertToInteger, [boolVec]).result + self.pushValue(result) + return + self.emitFatalError( f'Invalid function or class type requested from the cudaq module ({node.func.attr})', node) @@ -3453,6 +3568,11 @@ def get_item_type(pyval): listElemTy = get_item_type(node.elt) if listElemTy is None: return + + resultVecTy = cc.StdvecType.get(listElemTy) + isBool = listElemTy == self.getIntegerType(1) + if isBool: + listElemTy = self.getIntegerType(8) listTy = cc.ArrayType.get(listElemTy) listValue = cc.AllocaOp(cc.PointerType.get(listTy), TypeAttr.get(listElemTy), @@ -3482,12 +3602,15 @@ def bodyBuilder(iterVar): listValueAddr = cc.ComputePtrOp( cc.PointerType.get(listElemTy), listValue, [iterVar], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)) + + if isBool: + result = self.changeOperandToType(self.getIntegerType(8), result) cc.StoreOp(result, listValueAddr) self.symbolTable.popScope() self.createInvariantForLoop(iterableSize, bodyBuilder) self.pushValue( - cc.StdvecInitOp(cc.StdvecType.get(listElemTy), + cc.StdvecInitOp(resultVecTy, listValue, length=iterableSize).result) return @@ -3679,6 +3802,9 @@ def fix_negative_idx(idx, get_size): upper=upperVal).result) elif cc.StdvecType.isinstance(var.type): eleTy = cc.StdvecType.getElementType(var.type) + isBool = eleTy == self.getIntegerType(1) + if isBool: + eleTy = self.getIntegerType(8) ptrTy = cc.PointerType.get(eleTy) arrTy = cc.ArrayType.get(eleTy) ptrArrTy = cc.PointerType.get(arrTy) @@ -3722,6 +3848,9 @@ def fix_negative_idx(idx, get_size): if cc.StdvecType.isinstance(var.type): idx = fix_negative_idx(idx, lambda: get_size(var)) eleTy = cc.StdvecType.getElementType(var.type) + isBool = eleTy == self.getIntegerType(1) + if isBool: + eleTy = self.getIntegerType(8) elePtrTy = cc.PointerType.get(eleTy) arrTy = cc.ArrayType.get(eleTy) ptrArrTy = cc.PointerType.get(arrTy) @@ -3733,7 +3862,10 @@ def fix_negative_idx(idx, get_size): if self.subscriptPushPointerValue: self.pushValue(eleAddr) return - self.pushValue(cc.LoadOp(eleAddr).result) + val = cc.LoadOp(eleAddr).result + if isBool: + val = self.changeOperandToType(self.getIntegerType(1), val) + self.pushValue(val) return if cc.PointerType.isinstance(var.type): @@ -3960,7 +4092,9 @@ def functor(iter, idx): iterEleTy = cc.StdvecType.getElementType(iterable.type) totalSize = cc.StdvecSizeOp(self.getIntegerType(), iterable).result - + isBool = iterEleTy == self.getIntegerType(1) + if isBool: + iterEleTy = self.getIntegerType(8) def functor(iter, idxVal): elePtrTy = cc.PointerType.get(iterEleTy) arrTy = cc.ArrayType.get(iterEleTy) @@ -3970,7 +4104,10 @@ def functor(iter, idxVal): elePtrTy, vecPtr, [idxVal], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)).result - return cc.LoadOp(eleAddr).result + result = cc.LoadOp(eleAddr).result + if isBool: + result = self.changeOperandToType(self.getIntegerType(1), result) + return result extractFunctor = functor diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py index 799117a07bc..f1a685468a3 100644 --- a/python/cudaq/kernel/kernel_decorator.py +++ b/python/cudaq/kernel/kernel_decorator.py @@ -451,6 +451,24 @@ def __convertStringsToPauli__(self, arg): return arg + def getCallableNames(self, *args): + callableNames = [] + for arg in args: + if isinstance(arg, PyKernelDecorator): + callableNames.append(arg.name) + else: + if hasattr(arg, '__call__') and hasattr(arg, '__module__') and hasattr(arg, '__name__'): + # This is a callable object, likely a C++ kernel + devKey = f"{arg.__module__}.{arg.__name__}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( + self.module, devKey) + if maybeKernelName != None: + # Remove "__nvqpp__mlirgen__" prefix + maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") + callableNames.append(maybeKernelName) + return callableNames + def __call__(self, *args): """ Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR @@ -481,7 +499,8 @@ def __call__(self, *args): mlirType = mlirTypeFromPyType(type(arg), self.module.context, argInstance=arg, - argTypeToCompareTo=self.argTypes[i]) + argTypeToCompareTo=self.argTypes[i], + module=self.module) if self.isCastablePyType(mlirType, self.argTypes[i]): processedArgs.append( @@ -496,19 +515,39 @@ def __call__(self, *args): ) if cc.CallableType.isinstance(mlirType): - # Assume this is a PyKernelDecorator - callableNames.append(arg.name) - # It may be that the provided input callable kernel - # is not currently in the ModuleOp. Need to add it - # if that is the case, we have to use the AST - # so that it shares self.module's MLIR Context - symbols = SymbolTable(self.module.operation) - if nvqppPrefix + arg.name not in symbols: - tmpBridge = PyASTBridge(self.capturedDataStorage, - existingModule=self.module, - disableEntryPointTag=True) - tmpBridge.visit(globalAstRegistry[arg.name][0]) - + if isinstance(arg, PyKernelDecorator): + # Assume this is a PyKernelDecorator + callableNames.append(arg.name) + # It may be that the provided input callable kernel + # is not currently in the ModuleOp. Need to add it + # if that is the case, we have to use the AST + # so that it shares self.module's MLIR Context + symbols = SymbolTable(self.module.operation) + if nvqppPrefix + arg.name not in symbols: + tmpBridge = PyASTBridge(self.capturedDataStorage, + existingModule=self.module, + disableEntryPointTag=True) + tmpBridge.visit(globalAstRegistry[arg.name][0]) + else: + if hasattr(arg, '__call__') and hasattr(arg, '__module__') and hasattr(arg, '__name__'): + # This is a callable object, likely a C++ kernel + devKey = f"{arg.__module__}.{arg.__name__}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + print("111Found registered device module for callable object:", devKey) + + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( + self.module, devKey) + if maybeKernelName != None: + otherKernel = SymbolTable( + self.module.operation)[maybeKernelName] + print("Found Other kernel:", otherKernel) + # Remove "__nvqpp__mlirgen__" prefix + maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") + callableNames.append(maybeKernelName) + else: + emitFatalError( + "Invalid callable argument provided to kernel." + ) # Convert `numpy` arrays to lists if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"): if arg.ndim != 1: diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py index efaf213b581..c95c9a42cdf 100644 --- a/python/cudaq/kernel/utils.py +++ b/python/cudaq/kernel/utils.py @@ -15,8 +15,8 @@ import types from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime -from cudaq.mlir.dialects import quake, cc -from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType +from cudaq.mlir.dialects import quake, cc, func +from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable State = cudaq_runtime.State qvector = cudaq_runtime.qvector @@ -442,7 +442,41 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): if 'argInstance' in kwargs: argInstance = kwargs['argInstance'] if isinstance(argInstance, Callable): - return cc.CallableType.get(argInstance.argTypes, ctx) + if hasattr(argInstance, 'argTypes'): + print("Found Callable with argTypes:", argInstance.argTypes) + return cc.CallableType.get(argInstance.argTypes, ctx) + elif hasattr(argInstance, '__call__') and hasattr(argInstance, '__module__') and hasattr(argInstance, '__name__'): + # This is a callable object, likely a C++ kernel + devKey = f"{argInstance.__module__}.{argInstance.__name__}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + print("Found registered device module for callable object:", devKey) + if "module" in kwargs: + module = kwargs['module'] + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( + module, devKey) + if maybeKernelName != None: + otherKernel = SymbolTable( + module.operation)[maybeKernelName] + print("Found registered C++ kernel:", maybeKernelName) + print("Other kernel type:", otherKernel.type) + print("Other kernel:", otherKernel) + if isinstance(otherKernel, func.FuncOp): + print("HEY:", dir(otherKernel.type)) + print("HOW:", otherKernel.arguments) + argTypes = [] + for arg in otherKernel.arguments: + print("ARG TYPE:", arg.type) + argTypes.append(arg.type) + return cc.CallableType.get(argTypes, ctx) + else: + emitFatalError( + f"Registered C++ kernel '{maybeKernelName}' is not of CallableType." + ) + # maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(self.module, devKey) + # if maybeKernelName != None: + # otherKernel = SymbolTable( + # self.module.operation)[maybeKernelName] + # processFunctionCall(otherKernel.type, len(node.args)) for name in globalRegisteredTypes.classes: customTy, memberTys = globalRegisteredTypes.getClassAttributes(name) diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp index 8cd38a7295b..ef59b14f461 100644 --- a/python/runtime/cudaq/algorithms/py_run.cpp +++ b/python/runtime/cudaq/algorithms/py_run.cpp @@ -39,7 +39,8 @@ static std::vector readRunResults(mlir::ModuleOp module, } static std::tuple + mlir::func::FuncOp, std::string, mlir::func::FuncOp, + std::vector> getKernelLaunchParameters(py::object &kernel, py::args args) { if (!py::hasattr(kernel, "arguments")) throw std::runtime_error( @@ -52,6 +53,11 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { if (py::hasattr(kernel, "compile")) kernel.attr("compile")(); + std::vector callableNames; + if (py::hasattr(kernel, "getCallableNames")) + callableNames = + kernel.attr("getCallableNames")(*args).cast>(); + auto origKernName = kernel.attr("name").cast(); auto kernelName = origKernName + ".run"; if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none()) @@ -78,7 +84,8 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { } auto *argData = toOpaqueArgs(args, kernelMod, kernelName); auto funcOp = getKernelFuncOp(kernelMod, kernelName); - return {kernelName, kernelMod, argData, funcOp, origKernName, origKern}; + return {kernelName, kernelMod, argData, funcOp, + origKernName, origKern, callableNames}; } static details::RunResultSpan @@ -86,6 +93,7 @@ pyRunTheKernel(const std::string &name, const std::string &origName, MlirModule module, mlir::func::FuncOp funcOp, mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs, quantum_platform &platform, std::size_t shots_count, + const std::vector &callableNames, std::size_t qpu_id = 0) { auto returnTypes = origKernel.getResultTypes(); if (returnTypes.empty() || returnTypes.size() > 1) @@ -93,21 +101,24 @@ pyRunTheKernel(const std::string &name, const std::string &origName, "`cudaq.run` only supports kernels that return a value."); auto returnTy = returnTypes[0]; - // Disallow returning list / vectors from entry-point kernels. - if (returnTy.isa()) { - throw std::runtime_error("`cudaq.run` does not yet support returning " - "`list` from entry-point kernels."); + // Disallow returning nested vectors from entry-point kernels. + if (auto vecTy = dyn_cast(returnTy)) { + auto elemTy = vecTy.getElementType(); + if (elemTy.isa()) + throw std::runtime_error( + "`cudaq.run` does not yet support returning nested `list` from " + "entry-point kernels."); } auto mod = unwrap(module); - auto [rawArgs, size, returnOffset, thunk] = - pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false); + auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase( + name, module, returnTy, runtimeArgs, callableNames, 0, false); auto results = details::runTheKernel( [&]() mutable { pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size, - returnOffset, {}); + returnOffset, callableNames); }, platform, name, origName, shots_count, qpu_id); @@ -133,7 +144,7 @@ std::vector pyRun(py::object &kernel, py::args args, if (shots_count == 0) return {}; - auto [name, module, argData, func, origName, origKern] = + auto [name, module, argData, func, origName, origKern, callableNames] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -149,7 +160,7 @@ std::vector pyRun(py::object &kernel, py::args args, } auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData, - platform, shots_count); + platform, shots_count, callableNames); delete argData; auto results = pyReadResults(span, module, func, origKern, shots_count); @@ -184,7 +195,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, ") exceeds the number of available QPUs (" + std::to_string(numQPUs) + ")"); - auto [name, module, argData, func, origName, origKern] = + auto [name, module, argData, func, origName, origKern, callableNames] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -219,7 +230,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, QuantumTask wrapped = detail::make_copyable_function( [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count, qpu_id, argData, name, module, func, origKern, origName, - noise_model = std::move(noise_model)]() mutable { + noise_model = std::move(noise_model), callableNames]() mutable { auto &platform = get_platform(); // Launch the kernel in the appropriate context. @@ -227,8 +238,9 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, platform.set_noise(&noise_model.value()); try { - auto span = pyRunTheKernel(name, origName, module, func, origKern, - *argData, platform, shots_count, qpu_id); + auto span = + pyRunTheKernel(name, origName, module, func, origKern, *argData, + platform, shots_count, callableNames, qpu_id); delete argData; sp.set_value(span); ep.set_value(""); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 9db3e9e431f..16ba64bd849 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -117,8 +117,22 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod, auto *argData = new cudaq::OpaqueArguments(); args = simplifiedValidateInputArguments(args); setDataLayout(mod); - cudaq::packArgs(*argData, args, kernelFunc, - [](OpaqueArguments &, py::object &) { return false; }); + auto callableArgHandler = [](cudaq::OpaqueArguments &argData, + py::object &arg) { + if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { + printf("Handling callable argument.\n"); + // Just give it some dummy data that will not be used. + // We synthesize away all callables, the block argument + // remains but it is not used, so just give argsCreator + // something, and we'll make sure its cleaned up. + long *ourAllocatedArg = new long(); + argData.emplace_back(ourAllocatedArg, + [](void *ptr) { delete static_cast(ptr); }); + return true; + } + return false; + }; + cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler); return argData; } @@ -157,7 +171,6 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module, pm.addPass(cudaq::opt::createGenerateKernelExecution( {.startingArgIdx = startingArgIdx})); pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true})); - pm.addPass(cudaq::opt::createReturnToOutputLog()); pm.addPass(cudaq::opt::createLambdaLiftingPass()); pm.addPass(cudaq::opt::createDistributedDeviceCall()); std::string tl = getTransportLayer(); @@ -947,7 +960,8 @@ void bindAltLaunchKernel(py::module &mod, auto callableArgHandler = [](cudaq::OpaqueArguments &argData, py::object &arg) { - if (py::hasattr(arg, "module")) { + if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { + printf("Handling callable argument.\n"); // Just give it some dummy data that will not be used. // We synthesize away all callables, the block argument // remains but it is not used, so just give argsCreator diff --git a/python/tests/kernel/test_return_vectors.py b/python/tests/kernel/test_return_vectors.py new file mode 100644 index 00000000000..376aab05613 --- /dev/null +++ b/python/tests/kernel/test_return_vectors.py @@ -0,0 +1,132 @@ +import cudaq +import pytest +import os + + +def testReturnVectorBool(): + + @cudaq.kernel + def return_vec_bool() -> list[bool]: + ret = [True, False] + return ret + + res = cudaq.run(return_vec_bool, shots_count=1) + assert res == [[True, False]] + + @cudaq.kernel + def return_vec_bool_from_measure() -> list[bool]: + q = cudaq.qvector(5) + x(q) + ret = mz(q) + return ret + + res = cudaq.run(return_vec_bool_from_measure, shots_count=10) + assert res == [[True] * 5] * 10 + + @cudaq.kernel + def return_vec_bool_from_measure_mix() -> list[bool]: + q = cudaq.qvector(5) + for i in range(5): + if i % 2 == 0: + x(q[i]) + ret = mz(q) + return ret + + res = cudaq.run(return_vec_bool_from_measure_mix, shots_count=10) + assert res == [[True, False, True, False, True]] * 10 + + +def testReturnVectorInt(): + + @cudaq.kernel + def return_vec_int() -> list[int]: + ret = [1, 2, 3] + return ret + + res = cudaq.run(return_vec_int, shots_count=1) + assert res == [[1, 2, 3]] + + @cudaq.kernel + def return_vec_int_from_measure() -> list[int]: + q = cudaq.qvector(5) + x(q) + ret = mz(q) + int_ret = [0 for b in ret] + i = 0 + for b in ret: + if b: + int_ret[i] = 6 + i += 1 + return int_ret + + res = cudaq.run(return_vec_int_from_measure, shots_count=10) + assert res == [[6] * 5] * 10 + + @cudaq.kernel + def return_vec_int_from_measure_mix() -> list[int]: + q = cudaq.qvector(5) + for i in range(5): + if i % 2 == 0: + x(q[i]) + ret = mz(q) + int_ret = [0 for b in ret] + i = 0 + for b in ret: + if b: + int_ret[i] = 6 + i += 1 + return int_ret + + res = cudaq.run(return_vec_int_from_measure_mix, shots_count=10) + assert res == [[6, 0, 6, 0, 6]] * 10 + + +def testReturnVectorFloat(): + + @cudaq.kernel + def return_vec_float() -> list[float]: + ret = [1.1, 2.2, 3.3] + return ret + + res = cudaq.run(return_vec_float, shots_count=1) + assert res == [[1.1, 2.2, 3.3]] + + @cudaq.kernel + def return_vec_float_from_measure() -> list[float]: + q = cudaq.qvector(5) + x(q) + ret = mz(q) + float_ret = [0.0 for b in ret] + i = 0 + for b in ret: + if b: + float_ret[i] = 6.6 + i += 1 + return float_ret + + res = cudaq.run(return_vec_float_from_measure, shots_count=10) + assert res == [[6.6] * 5] * 10 + + @cudaq.kernel + def return_vec_float_from_measure_mix() -> list[float]: + q = cudaq.qvector(5) + for i in range(5): + if i % 2 == 0: + x(q[i]) + ret = mz(q) + float_ret = [0.0 for b in ret] + i = 0 + for b in ret: + if b: + float_ret[i] = 6.6 + i += 1 + return float_ret + + res = cudaq.run(return_vec_float_from_measure_mix, shots_count=10) + assert res == [[6.6, 0.0, 6.6, 0.0, 6.6]] * 10 + + +# leave for gdb debugging +if __name__ == "__main__": + loc = os.path.abspath(__file__) + pytest.main([loc, "-s"]) From 7e681a3dbebf78d372ad1f48df95a0a1057cd626 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Tue, 21 Oct 2025 07:05:18 +0000 Subject: [PATCH 2/8] Add tests Signed-off-by: Thien Nguyen --- python/cudaq/kernel/ast_bridge.py | 27 ++++--- .../cudaq/platform/py_alt_launch_kernel.cpp | 2 - .../tests/interop/quantum_lib/quantum_lib.cpp | 8 +++ .../tests/interop/quantum_lib/quantum_lib.h | 10 +++ .../test_cpp_quantum_algorithm_module.cpp | 71 +++++++++++++++++++ python/tests/interop/test_interop.py | 58 +++++++++++++++ 6 files changed, 163 insertions(+), 13 deletions(-) diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index 39c64b2274f..59665ef9292 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -2555,17 +2555,22 @@ def bodyBuilder(iterVal): self.pushValue(stackSlot) return # Check generic callable objects that may be C++ kernels - elif hasattr(var, '__call__') and hasattr(var, '__module__') and hasattr(var, '__name__'): - # This is a callable object, likely a C++ kernel - devKey = f"{var.__module__}.{var.__name__}" - if cudaq_runtime.isRegisteredDeviceModule(devKey): - maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( - self.module, devKey) - if maybeKernelName != None: - otherKernel = SymbolTable( - self.module.operation)[maybeKernelName] - processFunctionCall(otherKernel.type, len(node.args)) - return + elif hasattr(var, '__call__'): + # This is a callable object, which could be a C++ kernel + # Get the full module + name key and see if it is registered + modulePath = str(var.__module__) if hasattr( + var, '__module__') else '' + funcName = str(var.__name__) if hasattr( + var, '__name__') else '' + devKey = f"{modulePath}.{funcName}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( + self.module, devKey) + if maybeKernelName != None: + otherKernel = SymbolTable( + self.module.operation)[maybeKernelName] + processFunctionCall(otherKernel.type, len(node.args)) + return else: self.emitFatalError( "unhandled function call - {}, known kernels are {}".format( diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 16ba64bd849..5543a54b37d 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -120,7 +120,6 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod, auto callableArgHandler = [](cudaq::OpaqueArguments &argData, py::object &arg) { if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { - printf("Handling callable argument.\n"); // Just give it some dummy data that will not be used. // We synthesize away all callables, the block argument // remains but it is not used, so just give argsCreator @@ -961,7 +960,6 @@ void bindAltLaunchKernel(py::module &mod, auto callableArgHandler = [](cudaq::OpaqueArguments &argData, py::object &arg) { if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { - printf("Handling callable argument.\n"); // Just give it some dummy data that will not be used. // We synthesize away all callables, the block argument // remains but it is not used, so just give argsCreator diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp index 7c5cbb23054..ed313b32f88 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.cpp +++ b/python/tests/interop/quantum_lib/quantum_lib.cpp @@ -29,4 +29,12 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector &x, __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); } __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); } + +__qpu__ void reset_group(patch p) { + for (std::size_t i = 0; i < p.data.size(); i++) + reset(p.data[i]); +} + +__qpu__ void x_group(patch p) { x(p.data); } + } // namespace cudaq diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h index a0655099237..4b9fa371351 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.h +++ b/python/tests/interop/quantum_lib/quantum_lib.h @@ -9,6 +9,12 @@ #include "cudaq/qis/qubit_qis.h" +// Custom data structure +struct patch { + cudaq::qview<> data; + cudaq::qview<> aux; +}; + namespace cudaq { void entryPoint(const std::function &)> &statePrep); @@ -19,4 +25,8 @@ void another(cudaq::qview<> qubits, std::size_t); void uccsd(cudaq::qview<> qubits, std::size_t); +void reset_group(patch p); + +void x_group(patch p); + } // namespace cudaq diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp index 4ea2d2176cc..9d0b54bfa57 100644 --- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp +++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp @@ -8,6 +8,7 @@ #include "cudaq.h" #include "cudaq/algorithms/sample.h" +#include "cudaq/qis/qkernel.h" #include "quantum_lib/quantum_lib.h" #include "runtime/interop/PythonCppInterop.h" #include @@ -15,6 +16,22 @@ namespace py = pybind11; +namespace { +static std::unordered_map, std::size_t)>> + g_cppKernels_1; + +static std::unordered_map> + g_cppKernels_2; + +static const bool initKernels = []() { + g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd)); + g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group)); + g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group)); + return true; +}(); +} // namespace + PYBIND11_MODULE(cudaq_test_cpp_algo, m) { m.def("test_cpp_qalgo", [](py::object statePrepIn) { @@ -49,4 +66,58 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { cudaq::python::addDeviceKernelInterop, std::size_t>( m, "qstd", "uccsd", ""); + + // Convert the C++ kernel registry to Python-accessible kernels + auto interopSubMod = m.def_submodule("_cpp_interop_kernels"); + static std::unordered_map g_py_kernels; + + for (auto &[name, kernel] : g_cppKernels_1) { + const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull( + cudaq::registry::__cudaq_getLinkableKernelKey(&kernel)); + if (!qkernelName) { + throw std::runtime_error("Could not find registered kernel name for " + + name); + } + + std::string kernelName = qkernelName; + if (kernelName.starts_with("function_")) + kernelName = kernelName.substr(std::string("function_").length()); + + interopSubMod.def( + kernelName.c_str(), [](py::object qview, std::size_t i) {}, + "Auto-generated one-qubit encoding kernel from C++ code"); + cudaq::python::registerDeviceKernel( + interopSubMod.attr("__name__").cast(), kernelName, ""); + g_py_kernels.insert( + std::make_pair(name, interopSubMod.attr(kernelName.c_str()))); + } + + for (auto &[name, kernel] : g_cppKernels_2) { + const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull( + cudaq::registry::__cudaq_getLinkableKernelKey(&kernel)); + if (!qkernelName) { + throw std::runtime_error("Could not find registered kernel name for " + + name); + } + + std::string kernelName = qkernelName; + if (kernelName.starts_with("function_")) + kernelName = kernelName.substr(std::string("function_").length()); + + interopSubMod.def( + kernelName.c_str(), [](py::object patch) {}, + "Auto-generated one-qubit encoding kernel from C++ code"); + cudaq::python::registerDeviceKernel( + interopSubMod.attr("__name__").cast(), kernelName, ""); + g_py_kernels.insert( + std::make_pair(name, interopSubMod.attr(kernelName.c_str()))); + } + + m.def("get_cpp_kernel", [](const std::string &name) { + auto it = g_py_kernels.find(name); + if (it == g_py_kernels.end()) + throw std::runtime_error("No C++ kernel registered for requested name."); + + return it->second; + }); } diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py index e63588408ac..7e1e7e05fe9 100644 --- a/python/tests/interop/test_interop.py +++ b/python/tests/interop/test_interop.py @@ -7,6 +7,8 @@ # ============================================================================ # import cudaq, pytest +from typing import Callable +from dataclasses import dataclass cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo') @@ -242,3 +244,59 @@ def entry(): takesCapture(spin) entry.compile() + +def test_cpp_qkernel(): + # Test the `qkernel` provided in C++ via a map-like registry. + # This is provided as a function-like callable. + kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd") + + # Use as a capture + @cudaq.kernel + def cpp_qkernel(): + q = cudaq.qvector(4) + kernel_from_cpp_registry(q, 0) + + cpp_qkernel() + + + # Use as a callable argument + @cudaq.kernel + def caller(k: Callable[[cudaq.qview, int], None]): + q = cudaq.qvector(4) + k(q, 0) + + caller(kernel_from_cpp_registry) + + +def test_cpp_custom_struct(): + # Define a struct in Python that matches the C++ struct + @dataclass(slots=True) + class patch: + data: cudaq.qvector + aux: cudaq.qvector + + reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset") + x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x") + + # Use as a capture + @cudaq.kernel + def cpp_qkernel_struct(): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + x(q) + reset_qkernel(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct) + counts.dump() + assert len(counts) == 1 and '000000' in counts + + @cudaq.kernel + def cpp_qkernel_struct_x(): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + x_qkernel(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct_x) + counts.dump() + assert len(counts) == 1 and '111100' in counts + From 7c27ba50dcea0e55e235543d9ad350e1bb397113 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Tue, 21 Oct 2025 07:09:25 +0000 Subject: [PATCH 3/8] Remove temp code Signed-off-by: Thien Nguyen --- python/cudaq/kernel/utils.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py index c95c9a42cdf..8c0b62327e3 100644 --- a/python/cudaq/kernel/utils.py +++ b/python/cudaq/kernel/utils.py @@ -443,13 +443,11 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): argInstance = kwargs['argInstance'] if isinstance(argInstance, Callable): if hasattr(argInstance, 'argTypes'): - print("Found Callable with argTypes:", argInstance.argTypes) return cc.CallableType.get(argInstance.argTypes, ctx) elif hasattr(argInstance, '__call__') and hasattr(argInstance, '__module__') and hasattr(argInstance, '__name__'): # This is a callable object, likely a C++ kernel devKey = f"{argInstance.__module__}.{argInstance.__name__}" if cudaq_runtime.isRegisteredDeviceModule(devKey): - print("Found registered device module for callable object:", devKey) if "module" in kwargs: module = kwargs['module'] maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( @@ -457,26 +455,15 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): if maybeKernelName != None: otherKernel = SymbolTable( module.operation)[maybeKernelName] - print("Found registered C++ kernel:", maybeKernelName) - print("Other kernel type:", otherKernel.type) - print("Other kernel:", otherKernel) if isinstance(otherKernel, func.FuncOp): - print("HEY:", dir(otherKernel.type)) - print("HOW:", otherKernel.arguments) argTypes = [] for arg in otherKernel.arguments: - print("ARG TYPE:", arg.type) argTypes.append(arg.type) return cc.CallableType.get(argTypes, ctx) else: emitFatalError( f"Registered C++ kernel '{maybeKernelName}' is not of CallableType." ) - # maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(self.module, devKey) - # if maybeKernelName != None: - # otherKernel = SymbolTable( - # self.module.operation)[maybeKernelName] - # processFunctionCall(otherKernel.type, len(node.args)) for name in globalRegisteredTypes.classes: customTy, memberTys = globalRegisteredTypes.getClassAttributes(name) From d4579af48d70d6df7c2acdbe905172bbb721952c Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Tue, 21 Oct 2025 07:19:31 +0000 Subject: [PATCH 4/8] Add to_integer test for Python Signed-off-by: Thien Nguyen --- python/tests/kernel/test_to_integer.py | 41 ++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 python/tests/kernel/test_to_integer.py diff --git a/python/tests/kernel/test_to_integer.py b/python/tests/kernel/test_to_integer.py new file mode 100644 index 00000000000..959341a3698 --- /dev/null +++ b/python/tests/kernel/test_to_integer.py @@ -0,0 +1,41 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +import pytest +import os +import cudaq + +def testToInteger(): + @cudaq.kernel + def toIntegerKernel(applyX: list[int]) -> int: + q = cudaq.qvector(len(applyX)) + for i in range(len(applyX)): + if applyX[i]: + x(q[i]) + return cudaq.to_integer(mz(q)) + + test_cases = [ + [1, 1, 1], + [1, 1, 1, 1], + [1, 0, 1], + [1, 0, 0, 0], + [0, 0, 0, 1], + ] + + # See reference: targettests/execution/to_integer.cpp + expected_results = [7, 15, 5, 1, 8] + for applyX in test_cases: + counts = cudaq.run(toIntegerKernel, applyX) + # All shots should yield the same integer result + for result in counts: + assert result == expected_results[test_cases.index(applyX)] + + +# leave for gdb debugging +if __name__ == "__main__": + loc = os.path.abspath(__file__) + pytest.main([loc, "-rP"]) \ No newline at end of file From d06fb9333034cc048a5c18f3f6673742b008e198 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 22 Oct 2025 02:50:53 +0000 Subject: [PATCH 5/8] Unblock all Python run tests for returning vectors Signed-off-by: Thien Nguyen --- lib/Optimizer/Builder/Intrinsics.cpp | 130 ++++++++++++ lib/Optimizer/CodeGen/Pipelines.cpp | 1 + lib/Optimizer/CodeGen/ReturnToOutputLog.cpp | 155 +++++++++++++- python/cudaq/kernel/ast_bridge.py | 34 +-- python/tests/kernel/test_return_vectors.py | 132 ------------ python/tests/kernel/test_run_async_kernel.py | 149 +++++++------ python/tests/kernel/test_run_kernel.py | 208 +++++++++++++------ 7 files changed, 524 insertions(+), 285 deletions(-) delete mode 100644 python/tests/kernel/test_return_vectors.py diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index fd7622981fb..2931adefae7 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -371,6 +371,136 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::stdvecBoolCtorFromInitList, {}, R"#( func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -> ())#"}, + {"__nvqpp_internal_number_of_digits", {}, R"#( + func.func private @__nvqpp_internal_number_of_digits(%arg0: i64) -> i64 { + %c10_i64 = arith.constant 10 : i64 + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %0 = cc.alloca i64 + cc.store %arg0, %0 : !cc.ptr + %1 = cc.load %0 : !cc.ptr + %2 = cc.alloca i64 + cc.store %c0_i64, %2 : !cc.ptr + %3 = arith.cmpi eq, %1, %c0_i64 : i64 + cc.if(%3) { + cc.store %c1_i64, %2 : !cc.ptr + } + cc.loop while { + %5 = cc.load %0 : !cc.ptr + %6 = arith.cmpi sgt, %5, %c0_i64 : i64 + cc.condition %6 + } do { + %5 = cc.load %0 : !cc.ptr + %6 = arith.divsi %5, %c10_i64 : i64 + cc.store %6, %0 : !cc.ptr + %7 = cc.load %2 : !cc.ptr + %8 = arith.addi %7, %c1_i64 : i64 + cc.store %8, %2 : !cc.ptr + cc.continue + } + %4 = cc.load %2 : !cc.ptr + return %4 : i64 + } + )#"}, + + // __nvqpp_internal_tostring + {"__nvqpp_internal_tostring", {}, R"#( + func.func private @__nvqpp_internal_tostring(%buf: !cc.stdvec, %val: i64) { + %c48_i64 = arith.constant 48 : i64 + %c48_i32 = arith.constant 48 : i32 + %c0_i64 = arith.constant 0 : i64 + %c10_i64 = arith.constant 10 : i64 + %c1_i64 = arith.constant 1 : i64 + %c48_i8 = arith.constant 48 : i8 + %false = arith.constant false + %c0_i8 = arith.constant 0 : i8 + %0 = cc.alloca i64 + cc.store %val, %0 : !cc.ptr + %1 = cc.alloca i64 + cc.store %c10_i64, %1 : !cc.ptr + %2 = cc.stdvec_size %buf : (!cc.stdvec) -> i64 + %3 = cc.alloca i64 + cc.store %2, %3 : !cc.ptr + %4 = cc.load %3 : !cc.ptr + %5 = arith.subi %4, %c1_i64 : i64 + %6 = cc.alloca i64 + cc.store %5, %6 : !cc.ptr + %7 = cc.load %6 : !cc.ptr + %8 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %9 = cc.compute_ptr %8[%7] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %c0_i8, %9 : !cc.ptr + %10 = cc.load %6 : !cc.ptr + %11 = arith.subi %10, %c1_i64 : i64 + cc.store %11, %6 : !cc.ptr + cc.loop while { + %18 = cc.load %0 : !cc.ptr + %19 = cc.load %1 : !cc.ptr + %20 = arith.cmpi sge, %18, %19 : i64 + %21 = arith.cmpi eq, %20, %false : i1 + %22 = cc.if(%21) -> i1 { + cc.continue %false : i1 + } else { + %23 = cc.load %6 : !cc.ptr + %24 = arith.cmpi sge, %23, %c0_i64 : i64 + cc.continue %24 : i1 + } + cc.condition %22 + } do { + cc.scope { + %18 = cc.load %0 : !cc.ptr + %19 = cc.load %1 : !cc.ptr + %20 = arith.remsi %18, %19 : i64 + %21 = cc.cast %20 : (i64) -> i32 + %22 = cc.alloca i32 + cc.store %21, %22 : !cc.ptr + %23 = cc.load %1 : !cc.ptr + %24 = cc.load %0 : !cc.ptr + %25 = arith.divsi %24, %23 : i64 + cc.store %25, %0 : !cc.ptr + %26 = cc.load %6 : !cc.ptr + %27 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %28 = cc.compute_ptr %27[%26] : (!cc.ptr>, i64) -> !cc.ptr + %29 = cc.load %22 : !cc.ptr + %30 = arith.addi %29, %c48_i32 : i32 + %31 = cc.cast %30 : (i32) -> i8 + cc.store %31, %28 : !cc.ptr + %32 = cc.load %6 : !cc.ptr + %33 = arith.subi %32, %c1_i64 : i64 + cc.store %33, %6 : !cc.ptr + } + cc.continue + } + %12 = cc.load %6 : !cc.ptr + %13 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %14 = cc.compute_ptr %13[%12] : (!cc.ptr>, i64) -> !cc.ptr + %15 = cc.load %0 : !cc.ptr + %16 = arith.addi %15, %c48_i64 : i64 + %17 = cc.cast %16 : (i64) -> i8 + cc.store %17, %14 : !cc.ptr + cc.scope { + %18 = cc.alloca i64 + cc.store %c0_i64, %18 : !cc.ptr + cc.loop while { + %19 = cc.load %18 : !cc.ptr + %20 = cc.load %6 : !cc.ptr + %21 = arith.cmpi slt, %19, %20 : i64 + cc.condition %21 + } do { + %19 = cc.load %18 : !cc.ptr + %20 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %21 = cc.compute_ptr %20[%19] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %c48_i8, %21 : !cc.ptr + cc.continue + } step { + %19 = cc.load %18 : !cc.ptr + %20 = arith.addi %19, %c1_i64 : i64 + cc.store %20, %18 : !cc.ptr + } + } + return + } + )#"}, + // This helper function copies a buffer off the stack to the heap. This is // required when the data on the stack is about to go out of scope but is // still live. diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp index c4d0141afd4..a8a3f918968 100644 --- a/lib/Optimizer/CodeGen/Pipelines.cpp +++ b/lib/Optimizer/CodeGen/Pipelines.cpp @@ -98,6 +98,7 @@ void createTargetCodegenPipeline(PassManager &pm, pm.addNestedPass(createCSEPass()); ::addQIRConversionPipeline(pm, options.target); pm.addPass(cudaq::opt::createReturnToOutputLog()); + cudaq::opt::addLowerToCFG(pm); pm.addPass(createConvertMathToFuncs()); pm.addPass(createSymbolDCEPass()); pm.addPass(cudaq::opt::createCCToLLVM()); diff --git a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp index b4b175a31dd..09f280ff255 100644 --- a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp +++ b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp @@ -46,7 +46,8 @@ class ReturnRewrite : public OpRewritePattern { } static void genOutputLog(Location loc, PatternRewriter &rewriter, Value val, - std::optional prefix) { + std::optional prefix, + std::optional customLabel = std::nullopt) { Type valTy = val.getType(); TypeSwitch(valTy) .Case([&](IntegerType intTy) { @@ -54,7 +55,8 @@ class ReturnRewrite : public OpRewritePattern { std::string labelStr = std::string("i") + std::to_string(width); if (prefix) labelStr = prefix->str(); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); if (intTy.getWidth() == 1) { rewriter.create(loc, TypeRange{}, cudaq::opt::QIRBoolRecordOutput, @@ -80,7 +82,8 @@ class ReturnRewrite : public OpRewritePattern { std::string labelStr = std::string("f") + std::to_string(width); if (prefix) labelStr = prefix->str(); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); // Floating point: convert it to double, whatever it actually is. Value castVal = val; if (floatTy != rewriter.getF64Type()) @@ -94,7 +97,8 @@ class ReturnRewrite : public OpRewritePattern { auto labelStr = translateType(structTy); if (prefix) labelStr = prefix->str(); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); std::int32_t sz = structTy.getNumMembers(); Value size = rewriter.create(loc, sz, 64); rewriter.create(loc, TypeRange{}, @@ -111,7 +115,8 @@ class ReturnRewrite : public OpRewritePattern { }) .Case([&](cudaq::cc::ArrayType arrTy) { auto labelStr = translateType(arrTy); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); std::int32_t sz = arrTy.getSize(); Value size = rewriter.create(loc, sz, 64); rewriter.create(loc, TypeRange{}, @@ -128,13 +133,12 @@ class ReturnRewrite : public OpRewritePattern { } }) .Case([&](cudaq::cc::StdvecType vecTy) { - // For this type, we expect a cc.stdvec_init operation as the input. - // The data will be in a variable. - // If we reach here and we cannot determine the constant size of the - // buffer, then we will not generate any output logging. if (auto vecInit = val.getDefiningOp()) if (auto maybeLen = cudaq::opt::factory::maybeValueOfIntConstant( vecInit.getLength())) { + // For this type, we expect a cc.stdvec_init operation as the + // input. + // The data will be in a variable. std::int32_t sz = *maybeLen; auto labelStr = translateType(vecTy, sz); Value label = makeLabel(loc, rewriter, labelStr); @@ -159,6 +163,53 @@ class ReturnRewrite : public OpRewritePattern { genOutputLog(loc, rewriter, w, offset); } } + + // If we reach here and we cannot determine the constant size of the + // buffer, then we will not generate dynamic output logging with a for + // loop. + Value vecSz = rewriter.template create( + loc, rewriter.getI64Type(), val); + const std::string arrayLabelPrefix = + "array<" + translateType(vecTy.getElementType()) + " x "; + Value labelBuffer = + makeLabel(loc, rewriter, arrayLabelPrefix, vecSz, ">"); + rewriter.create(loc, TypeRange{}, + cudaq::opt::QIRArrayRecordOutput, + ArrayRef{vecSz, labelBuffer}); + auto eleTy = vecTy.getElementType(); + const bool isBool = (eleTy == rewriter.getI1Type()); + if (isBool) + eleTy = rewriter.getI8Type(); + auto elePtrTy = cudaq::cc::PointerType::get(eleTy); + auto eleArrTy = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy)); + auto vecPtr = + rewriter.create(loc, eleArrTy, val); + const std::string preStr = prefix ? prefix->str() : std::string{}; + cudaq::opt::factory::createInvariantLoop( + rewriter, loc, vecSz, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value indexVar = block.getArgument(0); + auto eleAddr = rewriter.create( + loc, elePtrTy, vecPtr, ValueRange{indexVar}); + + Value w = [&]() { + if (isBool) { + auto i1PtrTy = + cudaq::cc::PointerType::get(rewriter.getI1Type()); + auto i1Cast = rewriter.create( + loc, i1PtrTy, eleAddr); + return rewriter.create(loc, i1Cast); + } + + return rewriter.create(loc, eleAddr); + }(); + const std::string prefix = preStr + "["; + const std::string postfix = "]"; + Value dynamicLabel = + makeLabel(loc, rewriter, prefix, indexVar, postfix); + genOutputLog(loc, rewriter, w, std::nullopt, dynamicLabel); + }); }) .Default([&](Type) { // If we reach here, we don't know how to handle this type. @@ -207,6 +258,79 @@ class ReturnRewrite : public OpRewritePattern { auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); return rewriter.create(loc, i8PtrTy, lit); } + + static Value makeLabel(Location loc, PatternRewriter &rewriter, + const std::string &prefix, Value val, + const std::string &postFix) { + auto i64Ty = rewriter.getI64Type(); + auto i8Ty = rewriter.getI8Type(); + auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); + // Value must be i64 + if (val.getType() != i64Ty) + val = rewriter.create(loc, i64Ty, val); + // Compute the number of digits required + Value numDigits = rewriter + .create( + loc, i64Ty, "__nvqpp_internal_number_of_digits", + ArrayRef{val}) + .getResult(0); + Value valStrBuf = [&]() { + // Convert integer value to string + auto strSize = rewriter.create( + loc, numDigits, + rewriter.create(loc, 1, + 64)); // Add null terminator + auto buffer = rewriter.create(loc, i8Ty, strSize); + auto stdvecTy = cudaq::cc::StdvecType::get(i8Ty); + auto stringCharVec = rewriter.create( + loc, stdvecTy, buffer, strSize); + rewriter.create(loc, TypeRange{}, + "__nvqpp_internal_tostring", + ArrayRef{stringCharVec, val}); + return rewriter.create(loc, i8PtrTy, buffer); + }(); + + Value arrayPrefix = makeLabel(loc, rewriter, prefix); + Value arrayPostfix = makeLabel(loc, rewriter, postFix); + const int preFixLen = prefix.size(); + const int postFixLen = postFix.size(); + Value totalStrSize = rewriter.create( + loc, numDigits, + rewriter.create(loc, preFixLen + postFixLen + 1, + 64)); + auto labelBufferAlloc = + rewriter.create(loc, i8Ty, totalStrSize); + Value labelBuffer = + rewriter.create(loc, i8PtrTy, labelBufferAlloc); + + // Copy the prefix + rewriter.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{labelBuffer, arrayPrefix, + rewriter.create(loc, preFixLen, 64), + rewriter.create(loc, 0, 1)}); + // Copy the integer string + auto toPtr = rewriter.create( + loc, i8PtrTy, labelBufferAlloc, + ValueRange{rewriter.create(loc, preFixLen, 64)}); + rewriter.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, valStrBuf, numDigits, + rewriter.create(loc, 0, 1)}); + // Copy the postfix + null terminator + Value shift = rewriter.create( + loc, numDigits, + rewriter.create(loc, preFixLen, 64)); + toPtr = rewriter.create( + loc, i8PtrTy, labelBufferAlloc, ValueRange{shift}); + rewriter.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{ + toPtr, arrayPostfix, + rewriter.create(loc, postFixLen + 1, 64), + rewriter.create(loc, 0, 1)}); + return labelBuffer; + } }; struct ReturnToOutputLogPass @@ -230,6 +354,19 @@ struct ReturnToOutputLogPass return; } + if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_internal_tostring"))) { + module.emitError("could not load string conversion function."); + signalPassFailure(); + return; + } + + if (failed(irBuilder.loadIntrinsic(module, + "__nvqpp_internal_number_of_digits"))) { + module.emitError("could not load number of digits function."); + signalPassFailure(); + return; + } + RewritePatternSet patterns(ctx); patterns.insert(ctx); LLVM_DEBUG(llvm::dbgs() << "Before return to output logging:\n" << module); diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index 59665ef9292..3415810819d 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -2430,6 +2430,23 @@ def bodyBuilder(iterVal): # kernel registry correctly for the next conditional check if var.name in globalKernelRegistry: node.func.id = var.name + # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered) + elif hasattr(var, '__call__'): + # This is a callable object, which could be a C++ kernel + # Get the full module + name key and see if it is registered + modulePath = str(var.__module__) if hasattr( + var, '__module__') else '' + funcName = str(var.__name__) if hasattr( + var, '__name__') else '' + devKey = f"{modulePath}.{funcName}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( + self.module, devKey) + if maybeKernelName != None: + otherKernel = SymbolTable( + self.module.operation)[maybeKernelName] + processFunctionCall(otherKernel.type, len(node.args)) + return if node.func.id in globalKernelRegistry: # If in `globalKernelRegistry`, it has to be in this Module @@ -2554,23 +2571,6 @@ def bodyBuilder(iterVal): cc.StoreOp(ctorArgs[i], eleAddr) self.pushValue(stackSlot) return - # Check generic callable objects that may be C++ kernels - elif hasattr(var, '__call__'): - # This is a callable object, which could be a C++ kernel - # Get the full module + name key and see if it is registered - modulePath = str(var.__module__) if hasattr( - var, '__module__') else '' - funcName = str(var.__name__) if hasattr( - var, '__name__') else '' - devKey = f"{modulePath}.{funcName}" - if cudaq_runtime.isRegisteredDeviceModule(devKey): - maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( - self.module, devKey) - if maybeKernelName != None: - otherKernel = SymbolTable( - self.module.operation)[maybeKernelName] - processFunctionCall(otherKernel.type, len(node.args)) - return else: self.emitFatalError( "unhandled function call - {}, known kernels are {}".format( diff --git a/python/tests/kernel/test_return_vectors.py b/python/tests/kernel/test_return_vectors.py deleted file mode 100644 index 376aab05613..00000000000 --- a/python/tests/kernel/test_return_vectors.py +++ /dev/null @@ -1,132 +0,0 @@ -import cudaq -import pytest -import os - - -def testReturnVectorBool(): - - @cudaq.kernel - def return_vec_bool() -> list[bool]: - ret = [True, False] - return ret - - res = cudaq.run(return_vec_bool, shots_count=1) - assert res == [[True, False]] - - @cudaq.kernel - def return_vec_bool_from_measure() -> list[bool]: - q = cudaq.qvector(5) - x(q) - ret = mz(q) - return ret - - res = cudaq.run(return_vec_bool_from_measure, shots_count=10) - assert res == [[True] * 5] * 10 - - @cudaq.kernel - def return_vec_bool_from_measure_mix() -> list[bool]: - q = cudaq.qvector(5) - for i in range(5): - if i % 2 == 0: - x(q[i]) - ret = mz(q) - return ret - - res = cudaq.run(return_vec_bool_from_measure_mix, shots_count=10) - assert res == [[True, False, True, False, True]] * 10 - - -def testReturnVectorInt(): - - @cudaq.kernel - def return_vec_int() -> list[int]: - ret = [1, 2, 3] - return ret - - res = cudaq.run(return_vec_int, shots_count=1) - assert res == [[1, 2, 3]] - - @cudaq.kernel - def return_vec_int_from_measure() -> list[int]: - q = cudaq.qvector(5) - x(q) - ret = mz(q) - int_ret = [0 for b in ret] - i = 0 - for b in ret: - if b: - int_ret[i] = 6 - i += 1 - return int_ret - - res = cudaq.run(return_vec_int_from_measure, shots_count=10) - assert res == [[6] * 5] * 10 - - @cudaq.kernel - def return_vec_int_from_measure_mix() -> list[int]: - q = cudaq.qvector(5) - for i in range(5): - if i % 2 == 0: - x(q[i]) - ret = mz(q) - int_ret = [0 for b in ret] - i = 0 - for b in ret: - if b: - int_ret[i] = 6 - i += 1 - return int_ret - - res = cudaq.run(return_vec_int_from_measure_mix, shots_count=10) - assert res == [[6, 0, 6, 0, 6]] * 10 - - -def testReturnVectorFloat(): - - @cudaq.kernel - def return_vec_float() -> list[float]: - ret = [1.1, 2.2, 3.3] - return ret - - res = cudaq.run(return_vec_float, shots_count=1) - assert res == [[1.1, 2.2, 3.3]] - - @cudaq.kernel - def return_vec_float_from_measure() -> list[float]: - q = cudaq.qvector(5) - x(q) - ret = mz(q) - float_ret = [0.0 for b in ret] - i = 0 - for b in ret: - if b: - float_ret[i] = 6.6 - i += 1 - return float_ret - - res = cudaq.run(return_vec_float_from_measure, shots_count=10) - assert res == [[6.6] * 5] * 10 - - @cudaq.kernel - def return_vec_float_from_measure_mix() -> list[float]: - q = cudaq.qvector(5) - for i in range(5): - if i % 2 == 0: - x(q[i]) - ret = mz(q) - float_ret = [0.0 for b in ret] - i = 0 - for b in ret: - if b: - float_ret[i] = 6.6 - i += 1 - return float_ret - - res = cudaq.run(return_vec_float_from_measure_mix, shots_count=10) - assert res == [[6.6, 0.0, 6.6, 0.0, 6.6]] * 10 - - -# leave for gdb debugging -if __name__ == "__main__": - loc = os.path.abspath(__file__) - pytest.main([loc, "-s"]) diff --git a/python/tests/kernel/test_run_async_kernel.py b/python/tests/kernel/test_run_async_kernel.py index fc1c0ac3aae..31796c18e15 100644 --- a/python/tests/kernel/test_run_async_kernel.py +++ b/python/tests/kernel/test_run_async_kernel.py @@ -14,8 +14,6 @@ import numpy as np import pytest -list_err_msg = 'does not yet support returning `list` from entry-point kernels' - def is_close(actual, expected): return np.isclose(actual, expected, atol=1e-6) @@ -338,38 +336,41 @@ def test_return_list_bool(): def simple_list_bool_no_args() -> list[bool]: return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_bool_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool(n: int) -> list[bool]: qubits = cudaq.qvector(n) return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool, 2, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_bool, 2, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool_args, 2, [True, False, True]).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_bool_args, 2, [True, False, True], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]: qubits = cudaq.qvector(2) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool_args_no_broadcast, - [True, False, True]).get() - assert list_err_msg in str(e.value) - + results = cudaq.run_async(simple_list_bool_args_no_broadcast, + [True, False, True], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] def test_return_list_int(): @@ -377,18 +378,21 @@ def test_return_list_int(): def simple_list_int_no_args() -> list[int]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + + results = cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int8(): @@ -397,18 +401,22 @@ def test_return_list_int8(): def simple_list_int8_no_args() -> list[np.int8]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int8_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + + results = cudaq.run_async(simple_list_int8_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + + results = cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int16(): @@ -417,18 +425,20 @@ def test_return_list_int16(): def simple_list_int16_no_args() -> list[np.int16]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int16_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int16_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int32(): @@ -437,18 +447,20 @@ def test_return_list_int32(): def simple_list_int32_no_args() -> list[np.int32]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int32_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int32_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int64(): @@ -457,18 +469,20 @@ def test_return_list_int64(): def simple_list_int64_no_args() -> list[np.int64]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int64_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int64_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_float(): @@ -477,20 +491,22 @@ def test_return_list_float(): def simple_list_float_no_args() -> list[float]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_float_no_args, shots_count=2).get() + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float, + results = cudaq.run_async(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2).get() - assert list_err_msg in str(e.value) + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) def test_return_list_float32(): @@ -499,20 +515,22 @@ def test_return_list_float32(): def simple_list_float32_no_args() -> list[np.float32]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float32_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_float32_no_args, shots_count=2).get() + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float32, + results = cudaq.run_async(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2).get() - assert list_err_msg in str(e.value) + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) def test_return_list_float64(): @@ -521,21 +539,22 @@ def test_return_list_float64(): def simple_list_float64_no_args() -> list[np.float64]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float64_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_float64_no_args, shots_count=2).get() + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float64, + results = cudaq.run_async(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2).get() - assert list_err_msg in str(e.value) - + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) # Test tuples # TODO: Define spec for using tuples in kernels diff --git a/python/tests/kernel/test_run_kernel.py b/python/tests/kernel/test_run_kernel.py index 3e656ee16a9..d73b35fa352 100644 --- a/python/tests/kernel/test_run_kernel.py +++ b/python/tests/kernel/test_run_kernel.py @@ -14,8 +14,6 @@ import warnings import pytest -list_err_msg = 'does not yet support returning `list` from entry-point kernels' - skipIfBraketNotInstalled = pytest.mark.skipif( not (cudaq.has_target("braket")), reason='Could not find `braket` in installation') @@ -333,36 +331,41 @@ def test_return_list_bool(): def simple_list_bool_no_args() -> list[bool]: return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_bool_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool(n: int) -> list[bool]: qubits = cudaq.qvector(n) return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool, 2, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_bool, 2, shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool_args, 2, [True, False, True]) - assert list_err_msg in str(e.value) + + results = cudaq.run(simple_list_bool_args, 2, [True, False, True], shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]: qubits = cudaq.qvector(2) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True]) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True], shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] def test_return_list_int(): @@ -371,18 +374,20 @@ def test_return_list_int(): def simple_list_int_no_args() -> list[int]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int8(): @@ -391,18 +396,21 @@ def test_return_list_int8(): def simple_list_int8_no_args() -> list[np.int8]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int8_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int8_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] + @cudaq.kernel def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int16(): @@ -411,18 +419,21 @@ def test_return_list_int16(): def simple_list_int16_no_args() -> list[np.int16]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int16_no_args, shots_count=2) - assert list_err_msg in str(e.value) + + results = cudaq.run(simple_list_int16_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int32(): @@ -431,18 +442,20 @@ def test_return_list_int32(): def simple_list_int32_no_args() -> list[np.int32]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int32_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int32_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int64(): @@ -451,18 +464,20 @@ def test_return_list_int64(): def simple_list_int64_no_args() -> list[np.int64]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int64_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int64_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_float(): @@ -471,18 +486,20 @@ def test_return_list_float(): def simple_list_float_no_args() -> list[float]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float_no_args, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) def test_return_list_float32(): @@ -491,18 +508,20 @@ def test_return_list_float32(): def simple_list_float32_no_args() -> list[np.float32]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float32_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float32_no_args, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) def test_return_list_float64(): @@ -511,19 +530,84 @@ def test_return_list_float64(): def simple_list_float64_no_args() -> list[np.float64]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float64_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float64_no_args, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2) - assert list_err_msg in str(e.value) + + results = cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) +def test_return_list_large_size(): + # Returns a large list (dynamic size) to stress test the code generation + + @cudaq.kernel + def kernel_with_dynamic_int_array_input(n: int, t: list[int]) -> list[int]: + qubits = cudaq.qvector(n) + return t + + @cudaq.kernel + def kernel_with_dynamic_float_array_input(n: int, t: list[float]) -> list[float]: + qubits = cudaq.qvector(n) + return t + + @cudaq.kernel + def kernel_with_dynamic_bool_array_input(n: int, t: list[bool]) -> list[bool]: + qubits = cudaq.qvector(n) + return t + + # Test with various sizes (validate dynamic output logging) + for array_size in [10, 15, 100, 167, 1000]: + input_array = list(np.random.randint(-1000, 1000, size=array_size)) + results = cudaq.run(kernel_with_dynamic_int_array_input, 2, input_array, shots_count=2) + assert len(results) == 2 + assert results[0] == input_array + assert results[1] == input_array + + input_array_float = list(np.random.uniform(-1000.0, 1000.0, size=array_size)) + results = cudaq.run(kernel_with_dynamic_float_array_input, 2, input_array_float, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], input_array_float) + assert is_close_array(results[1], input_array_float) + + input_array_bool = [] + for _ in range(array_size): + input_array_bool.append(True if np.random.rand() > 0.5 else False) + results = cudaq.run(kernel_with_dynamic_bool_array_input, 2, input_array_bool, shots_count=2) + assert len(results) == 2 + assert results[0] == input_array_bool + assert results[1] == input_array_bool + +def test_return_dynamics_measure_results(): + @cudaq.kernel + def measure_all_qubits(numQubits: int) -> list[bool]: + # Number of qubits is dynamic + qubits = cudaq.qvector(numQubits) + for i in range(numQubits): + if i % 2 == 0: + x(qubits[i]) + + return mz(qubits) + + for numQubits in [1, 3, 5, 11, 20]: + shots = 2 + results = cudaq.run(measure_all_qubits, numQubits, shots_count=shots) + assert len(results) == shots + for res in results: + assert len(res) == numQubits + for i in range(numQubits): + if i % 2 == 0: + assert res[i] == True + else: + assert res[i] == False # Test tuples # TODO: Define spec for using tuples in kernels From 03e5836d525b9eea65e57597ac2971f1a80b3d90 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 22 Oct 2025 06:06:43 +0000 Subject: [PATCH 6/8] Fix tests Signed-off-by: Thien Nguyen --- lib/Optimizer/CodeGen/ReturnToOutputLog.cpp | 1 + python/tests/mlir/ast_list_comprehension.py | 44 ++++++++++++--------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp index 09f280ff255..01665222154 100644 --- a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp +++ b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp @@ -162,6 +162,7 @@ class ReturnRewrite : public OpRewritePattern { Value w = rewriter.create(loc, v); genOutputLog(loc, rewriter, w, offset); } + return; } // If we reach here and we cannot determine the constant size of the diff --git a/python/tests/mlir/ast_list_comprehension.py b/python/tests/mlir/ast_list_comprehension.py index ba3e936db4c..e0051bf28e7 100644 --- a/python/tests/mlir/ast_list_comprehension.py +++ b/python/tests/mlir/ast_list_comprehension.py @@ -55,10 +55,12 @@ def kernel3() -> float: # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK: %[[VAL_0:.*]] = arith.constant true -# CHECK: %[[VAL_1:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr +# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i8 +# CHECK: %[[VAL_1:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr +# CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_2]], %c5_i64 : (!cc.ptr>, i64) -> !cc.stdvec # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 @@ -122,10 +124,12 @@ def kernel3() -> float: # CHECK: %[[VAL_0:.*]] = arith.constant true # CHECK: %[[VAL_1:.*]] = cc.alloca i1 # CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr +# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr +# CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8 +# CHECK: cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 @@ -198,10 +202,12 @@ def kernel3() -> float: # CHECK: %[[VAL_0:.*]] = arith.constant true # CHECK: %[[VAL_1:.*]] = cc.alloca i1 # CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr +# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr +# CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8 +# CHECK: cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 @@ -271,14 +277,14 @@ def kernel3() -> float: # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 -# CHECK: %[[VAL_1:.*]] = arith.constant true +# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i8 +# CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 # CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array x 5> -# CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr> -# CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr -# CHECK: cc.store %[[VAL_1]], %[[VAL_5]] : !cc.ptr -# CHECK: %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_0]] : (!cc.ptr>, i64) -> !cc.stdvec +# CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr +# CHECK: cc.store %[[VAL_0]], %[[VAL_5]] : !cc.ptr +# CHECK: %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr>, i64) -> !cc.stdvec # CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr x 5>>, i64) -> !cc.ptr> # CHECK: cc.store %[[VAL_6]], %[[VAL_7]] : !cc.ptr> From da0bab6157271c8841972406c5c91382ee72d615 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 22 Oct 2025 06:42:12 +0000 Subject: [PATCH 7/8] Split interop support into a separate branch Signed-off-by: Thien Nguyen --- python/cudaq/kernel/ast_bridge.py | 35 --------- python/cudaq/kernel/kernel_decorator.py | 67 ++++------------- python/cudaq/kernel/utils.py | 27 +------ python/runtime/cudaq/algorithms/py_run.cpp | 31 +++----- .../cudaq/platform/py_alt_launch_kernel.cpp | 20 ++---- .../tests/interop/quantum_lib/quantum_lib.cpp | 8 --- .../tests/interop/quantum_lib/quantum_lib.h | 10 --- .../test_cpp_quantum_algorithm_module.cpp | 71 ------------------- python/tests/interop/test_interop.py | 58 --------------- 9 files changed, 32 insertions(+), 295 deletions(-) diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index 3415810819d..96f8ebc4f70 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -435,24 +435,6 @@ def changeOperandToType(self, ty, operand, allowDemotion=False): sint=operand_width != 1, zint=operand_width == 1).result - if quake.StruqType.isinstance(ty): - if quake.StruqType.isinstance(operand.type): - # Check that the struct types are the same, only the name may differ. - targetMemberType = quake.StruqType.getTypes(ty) - operandMemberType = quake.StruqType.getTypes(operand.type) - if len(targetMemberType) != len(operandMemberType): - self.emitFatalError( - f'cannot convert value of type {operand.type} to the requested type {ty}', - self.currentNode) - for i in range(len(targetMemberType)): - if targetMemberType[i] != operandMemberType[i]: - self.emitFatalError( - f'cannot convert value of type {operand.type} to the requested type {ty}', - self.currentNode) - # It is the same struct, do a cast - structPtr = self.ifNotPointerThenStore(operand) - castedPtr = cc.CastOp(cc.PointerType.get(ty), structPtr).result - return self.ifPointerThenLoad(castedPtr) self.emitFatalError( f'cannot convert value of type {operand.type} to the requested type {ty}', @@ -2430,23 +2412,6 @@ def bodyBuilder(iterVal): # kernel registry correctly for the next conditional check if var.name in globalKernelRegistry: node.func.id = var.name - # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered) - elif hasattr(var, '__call__'): - # This is a callable object, which could be a C++ kernel - # Get the full module + name key and see if it is registered - modulePath = str(var.__module__) if hasattr( - var, '__module__') else '' - funcName = str(var.__name__) if hasattr( - var, '__name__') else '' - devKey = f"{modulePath}.{funcName}" - if cudaq_runtime.isRegisteredDeviceModule(devKey): - maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( - self.module, devKey) - if maybeKernelName != None: - otherKernel = SymbolTable( - self.module.operation)[maybeKernelName] - processFunctionCall(otherKernel.type, len(node.args)) - return if node.func.id in globalKernelRegistry: # If in `globalKernelRegistry`, it has to be in this Module diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py index f1a685468a3..799117a07bc 100644 --- a/python/cudaq/kernel/kernel_decorator.py +++ b/python/cudaq/kernel/kernel_decorator.py @@ -451,24 +451,6 @@ def __convertStringsToPauli__(self, arg): return arg - def getCallableNames(self, *args): - callableNames = [] - for arg in args: - if isinstance(arg, PyKernelDecorator): - callableNames.append(arg.name) - else: - if hasattr(arg, '__call__') and hasattr(arg, '__module__') and hasattr(arg, '__name__'): - # This is a callable object, likely a C++ kernel - devKey = f"{arg.__module__}.{arg.__name__}" - if cudaq_runtime.isRegisteredDeviceModule(devKey): - maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( - self.module, devKey) - if maybeKernelName != None: - # Remove "__nvqpp__mlirgen__" prefix - maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") - callableNames.append(maybeKernelName) - return callableNames - def __call__(self, *args): """ Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR @@ -499,8 +481,7 @@ def __call__(self, *args): mlirType = mlirTypeFromPyType(type(arg), self.module.context, argInstance=arg, - argTypeToCompareTo=self.argTypes[i], - module=self.module) + argTypeToCompareTo=self.argTypes[i]) if self.isCastablePyType(mlirType, self.argTypes[i]): processedArgs.append( @@ -515,39 +496,19 @@ def __call__(self, *args): ) if cc.CallableType.isinstance(mlirType): - if isinstance(arg, PyKernelDecorator): - # Assume this is a PyKernelDecorator - callableNames.append(arg.name) - # It may be that the provided input callable kernel - # is not currently in the ModuleOp. Need to add it - # if that is the case, we have to use the AST - # so that it shares self.module's MLIR Context - symbols = SymbolTable(self.module.operation) - if nvqppPrefix + arg.name not in symbols: - tmpBridge = PyASTBridge(self.capturedDataStorage, - existingModule=self.module, - disableEntryPointTag=True) - tmpBridge.visit(globalAstRegistry[arg.name][0]) - else: - if hasattr(arg, '__call__') and hasattr(arg, '__module__') and hasattr(arg, '__name__'): - # This is a callable object, likely a C++ kernel - devKey = f"{arg.__module__}.{arg.__name__}" - if cudaq_runtime.isRegisteredDeviceModule(devKey): - print("111Found registered device module for callable object:", devKey) - - maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( - self.module, devKey) - if maybeKernelName != None: - otherKernel = SymbolTable( - self.module.operation)[maybeKernelName] - print("Found Other kernel:", otherKernel) - # Remove "__nvqpp__mlirgen__" prefix - maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") - callableNames.append(maybeKernelName) - else: - emitFatalError( - "Invalid callable argument provided to kernel." - ) + # Assume this is a PyKernelDecorator + callableNames.append(arg.name) + # It may be that the provided input callable kernel + # is not currently in the ModuleOp. Need to add it + # if that is the case, we have to use the AST + # so that it shares self.module's MLIR Context + symbols = SymbolTable(self.module.operation) + if nvqppPrefix + arg.name not in symbols: + tmpBridge = PyASTBridge(self.capturedDataStorage, + existingModule=self.module, + disableEntryPointTag=True) + tmpBridge.visit(globalAstRegistry[arg.name][0]) + # Convert `numpy` arrays to lists if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"): if arg.ndim != 1: diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py index 8c0b62327e3..efaf213b581 100644 --- a/python/cudaq/kernel/utils.py +++ b/python/cudaq/kernel/utils.py @@ -15,8 +15,8 @@ import types from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime -from cudaq.mlir.dialects import quake, cc, func -from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable +from cudaq.mlir.dialects import quake, cc +from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType State = cudaq_runtime.State qvector = cudaq_runtime.qvector @@ -442,28 +442,7 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): if 'argInstance' in kwargs: argInstance = kwargs['argInstance'] if isinstance(argInstance, Callable): - if hasattr(argInstance, 'argTypes'): - return cc.CallableType.get(argInstance.argTypes, ctx) - elif hasattr(argInstance, '__call__') and hasattr(argInstance, '__module__') and hasattr(argInstance, '__name__'): - # This is a callable object, likely a C++ kernel - devKey = f"{argInstance.__module__}.{argInstance.__name__}" - if cudaq_runtime.isRegisteredDeviceModule(devKey): - if "module" in kwargs: - module = kwargs['module'] - maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel( - module, devKey) - if maybeKernelName != None: - otherKernel = SymbolTable( - module.operation)[maybeKernelName] - if isinstance(otherKernel, func.FuncOp): - argTypes = [] - for arg in otherKernel.arguments: - argTypes.append(arg.type) - return cc.CallableType.get(argTypes, ctx) - else: - emitFatalError( - f"Registered C++ kernel '{maybeKernelName}' is not of CallableType." - ) + return cc.CallableType.get(argInstance.argTypes, ctx) for name in globalRegisteredTypes.classes: customTy, memberTys = globalRegisteredTypes.getClassAttributes(name) diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp index ef59b14f461..665836234df 100644 --- a/python/runtime/cudaq/algorithms/py_run.cpp +++ b/python/runtime/cudaq/algorithms/py_run.cpp @@ -39,8 +39,7 @@ static std::vector readRunResults(mlir::ModuleOp module, } static std::tuple> + mlir::func::FuncOp, std::string, mlir::func::FuncOp> getKernelLaunchParameters(py::object &kernel, py::args args) { if (!py::hasattr(kernel, "arguments")) throw std::runtime_error( @@ -53,11 +52,6 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { if (py::hasattr(kernel, "compile")) kernel.attr("compile")(); - std::vector callableNames; - if (py::hasattr(kernel, "getCallableNames")) - callableNames = - kernel.attr("getCallableNames")(*args).cast>(); - auto origKernName = kernel.attr("name").cast(); auto kernelName = origKernName + ".run"; if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none()) @@ -84,8 +78,7 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { } auto *argData = toOpaqueArgs(args, kernelMod, kernelName); auto funcOp = getKernelFuncOp(kernelMod, kernelName); - return {kernelName, kernelMod, argData, funcOp, - origKernName, origKern, callableNames}; + return {kernelName, kernelMod, argData, funcOp, origKernName, origKern}; } static details::RunResultSpan @@ -93,7 +86,6 @@ pyRunTheKernel(const std::string &name, const std::string &origName, MlirModule module, mlir::func::FuncOp funcOp, mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs, quantum_platform &platform, std::size_t shots_count, - const std::vector &callableNames, std::size_t qpu_id = 0) { auto returnTypes = origKernel.getResultTypes(); if (returnTypes.empty() || returnTypes.size() > 1) @@ -112,13 +104,13 @@ pyRunTheKernel(const std::string &name, const std::string &origName, auto mod = unwrap(module); - auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase( - name, module, returnTy, runtimeArgs, callableNames, 0, false); + auto [rawArgs, size, returnOffset, thunk] = + pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false); auto results = details::runTheKernel( [&]() mutable { pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size, - returnOffset, callableNames); + returnOffset, {}); }, platform, name, origName, shots_count, qpu_id); @@ -144,7 +136,7 @@ std::vector pyRun(py::object &kernel, py::args args, if (shots_count == 0) return {}; - auto [name, module, argData, func, origName, origKern, callableNames] = + auto [name, module, argData, func, origName, origKern] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -160,7 +152,7 @@ std::vector pyRun(py::object &kernel, py::args args, } auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData, - platform, shots_count, callableNames); + platform, shots_count); delete argData; auto results = pyReadResults(span, module, func, origKern, shots_count); @@ -195,7 +187,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, ") exceeds the number of available QPUs (" + std::to_string(numQPUs) + ")"); - auto [name, module, argData, func, origName, origKern, callableNames] = + auto [name, module, argData, func, origName, origKern] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -230,7 +222,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, QuantumTask wrapped = detail::make_copyable_function( [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count, qpu_id, argData, name, module, func, origKern, origName, - noise_model = std::move(noise_model), callableNames]() mutable { + noise_model = std::move(noise_model)]() mutable { auto &platform = get_platform(); // Launch the kernel in the appropriate context. @@ -238,9 +230,8 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, platform.set_noise(&noise_model.value()); try { - auto span = - pyRunTheKernel(name, origName, module, func, origKern, *argData, - platform, shots_count, callableNames, qpu_id); + auto span = pyRunTheKernel(name, origName, module, func, origKern, + *argData, platform, shots_count, qpu_id); delete argData; sp.set_value(span); ep.set_value(""); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 5543a54b37d..9db3e9e431f 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -117,21 +117,8 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod, auto *argData = new cudaq::OpaqueArguments(); args = simplifiedValidateInputArguments(args); setDataLayout(mod); - auto callableArgHandler = [](cudaq::OpaqueArguments &argData, - py::object &arg) { - if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { - // Just give it some dummy data that will not be used. - // We synthesize away all callables, the block argument - // remains but it is not used, so just give argsCreator - // something, and we'll make sure its cleaned up. - long *ourAllocatedArg = new long(); - argData.emplace_back(ourAllocatedArg, - [](void *ptr) { delete static_cast(ptr); }); - return true; - } - return false; - }; - cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler); + cudaq::packArgs(*argData, args, kernelFunc, + [](OpaqueArguments &, py::object &) { return false; }); return argData; } @@ -170,6 +157,7 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module, pm.addPass(cudaq::opt::createGenerateKernelExecution( {.startingArgIdx = startingArgIdx})); pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true})); + pm.addPass(cudaq::opt::createReturnToOutputLog()); pm.addPass(cudaq::opt::createLambdaLiftingPass()); pm.addPass(cudaq::opt::createDistributedDeviceCall()); std::string tl = getTransportLayer(); @@ -959,7 +947,7 @@ void bindAltLaunchKernel(py::module &mod, auto callableArgHandler = [](cudaq::OpaqueArguments &argData, py::object &arg) { - if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { + if (py::hasattr(arg, "module")) { // Just give it some dummy data that will not be used. // We synthesize away all callables, the block argument // remains but it is not used, so just give argsCreator diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp index ed313b32f88..7c5cbb23054 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.cpp +++ b/python/tests/interop/quantum_lib/quantum_lib.cpp @@ -29,12 +29,4 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector &x, __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); } __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); } - -__qpu__ void reset_group(patch p) { - for (std::size_t i = 0; i < p.data.size(); i++) - reset(p.data[i]); -} - -__qpu__ void x_group(patch p) { x(p.data); } - } // namespace cudaq diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h index 4b9fa371351..a0655099237 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.h +++ b/python/tests/interop/quantum_lib/quantum_lib.h @@ -9,12 +9,6 @@ #include "cudaq/qis/qubit_qis.h" -// Custom data structure -struct patch { - cudaq::qview<> data; - cudaq::qview<> aux; -}; - namespace cudaq { void entryPoint(const std::function &)> &statePrep); @@ -25,8 +19,4 @@ void another(cudaq::qview<> qubits, std::size_t); void uccsd(cudaq::qview<> qubits, std::size_t); -void reset_group(patch p); - -void x_group(patch p); - } // namespace cudaq diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp index 9d0b54bfa57..4ea2d2176cc 100644 --- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp +++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp @@ -8,7 +8,6 @@ #include "cudaq.h" #include "cudaq/algorithms/sample.h" -#include "cudaq/qis/qkernel.h" #include "quantum_lib/quantum_lib.h" #include "runtime/interop/PythonCppInterop.h" #include @@ -16,22 +15,6 @@ namespace py = pybind11; -namespace { -static std::unordered_map, std::size_t)>> - g_cppKernels_1; - -static std::unordered_map> - g_cppKernels_2; - -static const bool initKernels = []() { - g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd)); - g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group)); - g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group)); - return true; -}(); -} // namespace - PYBIND11_MODULE(cudaq_test_cpp_algo, m) { m.def("test_cpp_qalgo", [](py::object statePrepIn) { @@ -66,58 +49,4 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { cudaq::python::addDeviceKernelInterop, std::size_t>( m, "qstd", "uccsd", ""); - - // Convert the C++ kernel registry to Python-accessible kernels - auto interopSubMod = m.def_submodule("_cpp_interop_kernels"); - static std::unordered_map g_py_kernels; - - for (auto &[name, kernel] : g_cppKernels_1) { - const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull( - cudaq::registry::__cudaq_getLinkableKernelKey(&kernel)); - if (!qkernelName) { - throw std::runtime_error("Could not find registered kernel name for " + - name); - } - - std::string kernelName = qkernelName; - if (kernelName.starts_with("function_")) - kernelName = kernelName.substr(std::string("function_").length()); - - interopSubMod.def( - kernelName.c_str(), [](py::object qview, std::size_t i) {}, - "Auto-generated one-qubit encoding kernel from C++ code"); - cudaq::python::registerDeviceKernel( - interopSubMod.attr("__name__").cast(), kernelName, ""); - g_py_kernels.insert( - std::make_pair(name, interopSubMod.attr(kernelName.c_str()))); - } - - for (auto &[name, kernel] : g_cppKernels_2) { - const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull( - cudaq::registry::__cudaq_getLinkableKernelKey(&kernel)); - if (!qkernelName) { - throw std::runtime_error("Could not find registered kernel name for " + - name); - } - - std::string kernelName = qkernelName; - if (kernelName.starts_with("function_")) - kernelName = kernelName.substr(std::string("function_").length()); - - interopSubMod.def( - kernelName.c_str(), [](py::object patch) {}, - "Auto-generated one-qubit encoding kernel from C++ code"); - cudaq::python::registerDeviceKernel( - interopSubMod.attr("__name__").cast(), kernelName, ""); - g_py_kernels.insert( - std::make_pair(name, interopSubMod.attr(kernelName.c_str()))); - } - - m.def("get_cpp_kernel", [](const std::string &name) { - auto it = g_py_kernels.find(name); - if (it == g_py_kernels.end()) - throw std::runtime_error("No C++ kernel registered for requested name."); - - return it->second; - }); } diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py index 7e1e7e05fe9..e63588408ac 100644 --- a/python/tests/interop/test_interop.py +++ b/python/tests/interop/test_interop.py @@ -7,8 +7,6 @@ # ============================================================================ # import cudaq, pytest -from typing import Callable -from dataclasses import dataclass cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo') @@ -244,59 +242,3 @@ def entry(): takesCapture(spin) entry.compile() - -def test_cpp_qkernel(): - # Test the `qkernel` provided in C++ via a map-like registry. - # This is provided as a function-like callable. - kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd") - - # Use as a capture - @cudaq.kernel - def cpp_qkernel(): - q = cudaq.qvector(4) - kernel_from_cpp_registry(q, 0) - - cpp_qkernel() - - - # Use as a callable argument - @cudaq.kernel - def caller(k: Callable[[cudaq.qview, int], None]): - q = cudaq.qvector(4) - k(q, 0) - - caller(kernel_from_cpp_registry) - - -def test_cpp_custom_struct(): - # Define a struct in Python that matches the C++ struct - @dataclass(slots=True) - class patch: - data: cudaq.qvector - aux: cudaq.qvector - - reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset") - x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x") - - # Use as a capture - @cudaq.kernel - def cpp_qkernel_struct(): - q = cudaq.qvector(4) - r = cudaq.qvector(2) - x(q) - reset_qkernel(patch(q, r)) - - counts = cudaq.sample(cpp_qkernel_struct) - counts.dump() - assert len(counts) == 1 and '000000' in counts - - @cudaq.kernel - def cpp_qkernel_struct_x(): - q = cudaq.qvector(4) - r = cudaq.qvector(2) - x_qkernel(patch(q, r)) - - counts = cudaq.sample(cpp_qkernel_struct_x) - counts.dump() - assert len(counts) == 1 and '111100' in counts - From 2ba98ca1d485d3bad9d4cb052e3079ab00698d67 Mon Sep 17 00:00:00 2001 From: Thien Nguyen Date: Wed, 22 Oct 2025 06:46:53 +0000 Subject: [PATCH 8/8] Support for qkernel interop Signed-off-by: Thien Nguyen Tidy up Signed-off-by: Thien Nguyen --- python/cudaq/kernel/ast_bridge.py | 16 +++- python/cudaq/kernel/kernel_decorator.py | 56 +++++++++---- python/cudaq/kernel/utils.py | 48 ++++++++++-- python/runtime/cudaq/algorithms/py_run.cpp | 31 +++++--- .../cudaq/platform/py_alt_launch_kernel.cpp | 20 ++++- python/runtime/interop/PythonCppInterop.h | 26 +++++++ .../tests/interop/quantum_lib/quantum_lib.cpp | 8 ++ .../tests/interop/quantum_lib/quantum_lib.h | 10 +++ .../test_cpp_quantum_algorithm_module.cpp | 39 ++++++++++ python/tests/interop/test_interop.py | 78 +++++++++++++++++++ 10 files changed, 293 insertions(+), 39 deletions(-) diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index 96f8ebc4f70..9069b4bd168 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -31,7 +31,7 @@ from .utils import (Color, globalAstRegistry, globalKernelRegistry, globalRegisteredOperations, globalRegisteredTypes, nvqppPrefix, mlirTypeFromAnnotation, mlirTypeFromPyType, - mlirTypeToPyType, mlirTryCreateStructType) + mlirTypeToPyType, mlirTryCreateStructType, getInteropKernelNameIfFound) State = cudaq_runtime.State @@ -435,7 +435,6 @@ def changeOperandToType(self, ty, operand, allowDemotion=False): sint=operand_width != 1, zint=operand_width == 1).result - self.emitFatalError( f'cannot convert value of type {operand.type} to the requested type {ty}', self.currentNode) @@ -2412,6 +2411,15 @@ def bodyBuilder(iterVal): # kernel registry correctly for the next conditional check if var.name in globalKernelRegistry: node.func.id = var.name + # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered) + elif hasattr(var, '__call__'): + # Check if this is a registered C++ kernel + maybeKernelName = getInteropKernelNameIfFound(var, self.module) + if maybeKernelName != None: + otherKernel = SymbolTable( + self.module.operation)[maybeKernelName] + processFunctionCall(otherKernel.type, len(node.args)) + return if node.func.id in globalKernelRegistry: # If in `globalKernelRegistry`, it has to be in this Module @@ -2493,8 +2501,10 @@ def bodyBuilder(iterVal): for _, v in annotations.items() ] + unnamed_struct = "__repr__" not in cls.__dict__ + struct_name = node.func.id if not unnamed_struct else "" structTy = mlirTryCreateStructType(structTys, - name=node.func.id, + name=struct_name, context=self.ctx) if structTy is None: self.emitFatalError( diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py index 799117a07bc..ad571d89393 100644 --- a/python/cudaq/kernel/kernel_decorator.py +++ b/python/cudaq/kernel/kernel_decorator.py @@ -21,7 +21,7 @@ from .captured_data import CapturedDataStorage from .utils import (emitFatalError, emitErrorIfInvalidPauli, globalAstRegistry, globalRegisteredTypes, mlirTypeFromPyType, mlirTypeToPyType, - nvqppPrefix) + nvqppPrefix, getInteropKernelNameIfFound) # This file implements the decorator mechanism needed to # JIT compile CUDA-Q kernels. It exposes the cudaq.kernel() @@ -451,6 +451,20 @@ def __convertStringsToPauli__(self, arg): return arg + def getCallableNames(self, *args): + callableNames = [] + for arg in args: + if isinstance(arg, PyKernelDecorator): + callableNames.append(arg.name) + else: + if hasattr(arg, '__call__'): + maybeKernelName = getInteropKernelNameIfFound(arg, self.module) + if maybeKernelName != None: + # Remove "__nvqpp__mlirgen__" prefix when packing the list of callables + maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") + callableNames.append(maybeKernelName) + return callableNames + def __call__(self, *args): """ Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR @@ -481,7 +495,8 @@ def __call__(self, *args): mlirType = mlirTypeFromPyType(type(arg), self.module.context, argInstance=arg, - argTypeToCompareTo=self.argTypes[i]) + argTypeToCompareTo=self.argTypes[i], + module=self.module) if self.isCastablePyType(mlirType, self.argTypes[i]): processedArgs.append( @@ -496,19 +511,30 @@ def __call__(self, *args): ) if cc.CallableType.isinstance(mlirType): - # Assume this is a PyKernelDecorator - callableNames.append(arg.name) - # It may be that the provided input callable kernel - # is not currently in the ModuleOp. Need to add it - # if that is the case, we have to use the AST - # so that it shares self.module's MLIR Context - symbols = SymbolTable(self.module.operation) - if nvqppPrefix + arg.name not in symbols: - tmpBridge = PyASTBridge(self.capturedDataStorage, - existingModule=self.module, - disableEntryPointTag=True) - tmpBridge.visit(globalAstRegistry[arg.name][0]) - + if isinstance(arg, PyKernelDecorator): + # Assume this is a PyKernelDecorator + callableNames.append(arg.name) + # It may be that the provided input callable kernel + # is not currently in the ModuleOp. Need to add it + # if that is the case, we have to use the AST + # so that it shares self.module's MLIR Context + symbols = SymbolTable(self.module.operation) + if nvqppPrefix + arg.name not in symbols: + tmpBridge = PyASTBridge(self.capturedDataStorage, + existingModule=self.module, + disableEntryPointTag=True) + tmpBridge.visit(globalAstRegistry[arg.name][0]) + else: + if hasattr(arg, '__call__'): + maybeKernelName = getInteropKernelNameIfFound(arg, self.module) + if maybeKernelName != None: + # Remove "__nvqpp__mlirgen__" prefix + maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") + callableNames.append(maybeKernelName) + else: + emitFatalError( + "Invalid callable argument provided to kernel." + ) # Convert `numpy` arrays to lists if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"): if arg.ndim != 1: diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py index efaf213b581..e7f447be516 100644 --- a/python/cudaq/kernel/utils.py +++ b/python/cudaq/kernel/utils.py @@ -15,8 +15,8 @@ import types from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime -from cudaq.mlir.dialects import quake, cc -from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType +from cudaq.mlir.dialects import quake, cc, func +from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable State = cudaq_runtime.State qvector = cudaq_runtime.qvector @@ -119,8 +119,9 @@ def isQuantumType(ty): if numQuantumMembers != len(mlirEleTypes) or \ any((quake.StruqType.isinstance(t) for t in mlirEleTypes)): return None - return quake.StruqType.getNamed(name, mlirEleTypes, context=context) - + if len(name) > 0: + return quake.StruqType.getNamed(name, mlirEleTypes, context=context) + return quake.StruqType.get(mlirEleTypes, context=context) def mlirTypeFromAnnotation(annotation, ctx, raiseError=False): """ @@ -284,6 +285,7 @@ def emitFatalErrorOverride(msg): f"Adding new fields in data classes is not yet supported. The dataclass must be declared with @dataclass(slots=True) or @dataclasses.dataclass(slots=True)." ) + unnamed_struct = "__repr__" not in pyType.__dict__ if len({ k: v for k, v in pyType.__dict__.items() @@ -293,7 +295,8 @@ def emitFatalErrorOverride(msg): localEmitFatalError( 'struct types with user specified methods are not allowed.') - tupleTy = mlirTryCreateStructType(structTys, name=id) + struct_name = id if not unnamed_struct else "" + tupleTy = mlirTryCreateStructType(structTys, name=struct_name) if tupleTy is None: localEmitFatalError( "Hybrid quantum-classical data types and nested quantum structs are not allowed." @@ -442,7 +445,19 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): if 'argInstance' in kwargs: argInstance = kwargs['argInstance'] if isinstance(argInstance, Callable): - return cc.CallableType.get(argInstance.argTypes, ctx) + if hasattr(argInstance, 'argTypes'): + return cc.CallableType.get(argInstance.argTypes, ctx) + elif "module" in kwargs and hasattr(argInstance, '__call__'): + # This is a callable object, check if it's a C++ `qkernel` + maybeKernelName = getInteropKernelNameIfFound(argInstance, kwargs['module']) + if maybeKernelName != None: + otherKernel = SymbolTable( + kwargs['module'].operation)[maybeKernelName] + if isinstance(otherKernel, func.FuncOp): + argTypes = [] + for arg in otherKernel.arguments: + argTypes.append(arg.type) + return cc.CallableType.get(argTypes, ctx) for name in globalRegisteredTypes.classes: customTy, memberTys = globalRegisteredTypes.getClassAttributes(name) @@ -557,6 +572,27 @@ def mlirTypeToPyType(argType): emitFatalError( f"Cannot infer python type from provided CUDA-Q type ({argType})") +def getInteropKernelNameIfFound(pyFunc, module): + """ + Given a Python function and an MLIR module, check if the function + is registered as an interop kernel. If so, return the kernel name. + Otherwise, return None. + """ + if not callable(pyFunc): + emitFatalError( + f"Provided argument is not a callable function ({pyFunc})" + ) + + modulePath = str(pyFunc.__module__) if hasattr(pyFunc, '__module__') else '' + funcName = str(pyFunc.__name__) if hasattr(pyFunc, '__name__') else '' + # Look up key + devKey = f"{modulePath}.{funcName}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(module, devKey) + if maybeKernelName != None: + return maybeKernelName + + return None def emitErrorIfInvalidPauli(pauliArg): """ diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp index 665836234df..ef59b14f461 100644 --- a/python/runtime/cudaq/algorithms/py_run.cpp +++ b/python/runtime/cudaq/algorithms/py_run.cpp @@ -39,7 +39,8 @@ static std::vector readRunResults(mlir::ModuleOp module, } static std::tuple + mlir::func::FuncOp, std::string, mlir::func::FuncOp, + std::vector> getKernelLaunchParameters(py::object &kernel, py::args args) { if (!py::hasattr(kernel, "arguments")) throw std::runtime_error( @@ -52,6 +53,11 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { if (py::hasattr(kernel, "compile")) kernel.attr("compile")(); + std::vector callableNames; + if (py::hasattr(kernel, "getCallableNames")) + callableNames = + kernel.attr("getCallableNames")(*args).cast>(); + auto origKernName = kernel.attr("name").cast(); auto kernelName = origKernName + ".run"; if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none()) @@ -78,7 +84,8 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { } auto *argData = toOpaqueArgs(args, kernelMod, kernelName); auto funcOp = getKernelFuncOp(kernelMod, kernelName); - return {kernelName, kernelMod, argData, funcOp, origKernName, origKern}; + return {kernelName, kernelMod, argData, funcOp, + origKernName, origKern, callableNames}; } static details::RunResultSpan @@ -86,6 +93,7 @@ pyRunTheKernel(const std::string &name, const std::string &origName, MlirModule module, mlir::func::FuncOp funcOp, mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs, quantum_platform &platform, std::size_t shots_count, + const std::vector &callableNames, std::size_t qpu_id = 0) { auto returnTypes = origKernel.getResultTypes(); if (returnTypes.empty() || returnTypes.size() > 1) @@ -104,13 +112,13 @@ pyRunTheKernel(const std::string &name, const std::string &origName, auto mod = unwrap(module); - auto [rawArgs, size, returnOffset, thunk] = - pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false); + auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase( + name, module, returnTy, runtimeArgs, callableNames, 0, false); auto results = details::runTheKernel( [&]() mutable { pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size, - returnOffset, {}); + returnOffset, callableNames); }, platform, name, origName, shots_count, qpu_id); @@ -136,7 +144,7 @@ std::vector pyRun(py::object &kernel, py::args args, if (shots_count == 0) return {}; - auto [name, module, argData, func, origName, origKern] = + auto [name, module, argData, func, origName, origKern, callableNames] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -152,7 +160,7 @@ std::vector pyRun(py::object &kernel, py::args args, } auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData, - platform, shots_count); + platform, shots_count, callableNames); delete argData; auto results = pyReadResults(span, module, func, origKern, shots_count); @@ -187,7 +195,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, ") exceeds the number of available QPUs (" + std::to_string(numQPUs) + ")"); - auto [name, module, argData, func, origName, origKern] = + auto [name, module, argData, func, origName, origKern, callableNames] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -222,7 +230,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, QuantumTask wrapped = detail::make_copyable_function( [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count, qpu_id, argData, name, module, func, origKern, origName, - noise_model = std::move(noise_model)]() mutable { + noise_model = std::move(noise_model), callableNames]() mutable { auto &platform = get_platform(); // Launch the kernel in the appropriate context. @@ -230,8 +238,9 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, platform.set_noise(&noise_model.value()); try { - auto span = pyRunTheKernel(name, origName, module, func, origKern, - *argData, platform, shots_count, qpu_id); + auto span = + pyRunTheKernel(name, origName, module, func, origKern, *argData, + platform, shots_count, callableNames, qpu_id); delete argData; sp.set_value(span); ep.set_value(""); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 9db3e9e431f..5543a54b37d 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -117,8 +117,21 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod, auto *argData = new cudaq::OpaqueArguments(); args = simplifiedValidateInputArguments(args); setDataLayout(mod); - cudaq::packArgs(*argData, args, kernelFunc, - [](OpaqueArguments &, py::object &) { return false; }); + auto callableArgHandler = [](cudaq::OpaqueArguments &argData, + py::object &arg) { + if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { + // Just give it some dummy data that will not be used. + // We synthesize away all callables, the block argument + // remains but it is not used, so just give argsCreator + // something, and we'll make sure its cleaned up. + long *ourAllocatedArg = new long(); + argData.emplace_back(ourAllocatedArg, + [](void *ptr) { delete static_cast(ptr); }); + return true; + } + return false; + }; + cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler); return argData; } @@ -157,7 +170,6 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module, pm.addPass(cudaq::opt::createGenerateKernelExecution( {.startingArgIdx = startingArgIdx})); pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true})); - pm.addPass(cudaq::opt::createReturnToOutputLog()); pm.addPass(cudaq::opt::createLambdaLiftingPass()); pm.addPass(cudaq::opt::createDistributedDeviceCall()); std::string tl = getTransportLayer(); @@ -947,7 +959,7 @@ void bindAltLaunchKernel(py::module &mod, auto callableArgHandler = [](cudaq::OpaqueArguments &argData, py::object &arg) { - if (py::hasattr(arg, "module")) { + if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { // Just give it some dummy data that will not be used. // We synthesize away all callables, the block argument // remains but it is not used, so just give argsCreator diff --git a/python/runtime/interop/PythonCppInterop.h b/python/runtime/interop/PythonCppInterop.h index 9b39aada636..c74a8ec2872 100644 --- a/python/runtime/interop/PythonCppInterop.h +++ b/python/runtime/interop/PythonCppInterop.h @@ -7,6 +7,8 @@ ******************************************************************************/ #pragma once +#include "cudaq/qis/qkernel.h" +#include "cudaq/utils/registry.h" #include namespace py = pybind11; @@ -166,4 +168,28 @@ void addDeviceKernelInterop(py::module_ &m, const std::string &modName, kernelName, mangledArgs); return; } + +// Specialization for qkernel +template +py::object convertQkernel(py::module_ &m, cudaq::qkernel &qkernel, + const std::string &docstring = "") { + const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull( + cudaq::registry::__cudaq_getLinkableKernelKey(&qkernel)); + if (!qkernelName) + throw std::runtime_error( + "Invalid `qkernel` passed, could not find registered kernel."); + std::string kernelName = qkernelName; + // Rremove "function_" prefix if exists + if (kernelName.starts_with("function_")) + kernelName = kernelName.substr(std::string("function_").length()); + const std::string docStr = + docstring.empty() + ? "Auto-generated kernel from C++ " + kernelName + " qkernel." + : docstring; + m.def( + kernelName.c_str(), [](Args...) {}, docStr.c_str()); + cudaq::python::registerDeviceKernel(m.attr("__name__").cast(), + kernelName, ""); + return m.attr(kernelName.c_str()); +} } // namespace cudaq::python diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp index 7c5cbb23054..ed313b32f88 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.cpp +++ b/python/tests/interop/quantum_lib/quantum_lib.cpp @@ -29,4 +29,12 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector &x, __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); } __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); } + +__qpu__ void reset_group(patch p) { + for (std::size_t i = 0; i < p.data.size(); i++) + reset(p.data[i]); +} + +__qpu__ void x_group(patch p) { x(p.data); } + } // namespace cudaq diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h index a0655099237..4b9fa371351 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.h +++ b/python/tests/interop/quantum_lib/quantum_lib.h @@ -9,6 +9,12 @@ #include "cudaq/qis/qubit_qis.h" +// Custom data structure +struct patch { + cudaq::qview<> data; + cudaq::qview<> aux; +}; + namespace cudaq { void entryPoint(const std::function &)> &statePrep); @@ -19,4 +25,8 @@ void another(cudaq::qview<> qubits, std::size_t); void uccsd(cudaq::qview<> qubits, std::size_t); +void reset_group(patch p); + +void x_group(patch p); + } // namespace cudaq diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp index 4ea2d2176cc..92f1382f2a8 100644 --- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp +++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp @@ -8,6 +8,7 @@ #include "cudaq.h" #include "cudaq/algorithms/sample.h" +#include "cudaq/qis/qkernel.h" #include "quantum_lib/quantum_lib.h" #include "runtime/interop/PythonCppInterop.h" #include @@ -15,6 +16,22 @@ namespace py = pybind11; +namespace { +static std::unordered_map, std::size_t)>> + g_cppKernels_1; + +static std::unordered_map> + g_cppKernels_2; + +static const bool initKernels = []() { + g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd)); + g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group)); + g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group)); + return true; +}(); +} // namespace + PYBIND11_MODULE(cudaq_test_cpp_algo, m) { m.def("test_cpp_qalgo", [](py::object statePrepIn) { @@ -49,4 +66,26 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { cudaq::python::addDeviceKernelInterop, std::size_t>( m, "qstd", "uccsd", ""); + + // Convert the C++ kernel registry to Python-accessible kernels + auto interopSubMod = m.def_submodule("_cpp_interop_kernels"); + static std::unordered_map g_py_kernels; + + for (auto &[name, kernel] : g_cppKernels_1) { + g_py_kernels.insert(std::make_pair( + name, cudaq::python::convertQkernel(interopSubMod, kernel))); + } + + for (auto &[name, kernel] : g_cppKernels_2) { + g_py_kernels.insert(std::make_pair( + name, cudaq::python::convertQkernel(interopSubMod, kernel))); + } + + m.def("get_cpp_kernel", [](const std::string &name) { + auto it = g_py_kernels.find(name); + if (it == g_py_kernels.end()) + throw std::runtime_error("No C++ kernel registered for requested name."); + + return it->second; + }); } diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py index e63588408ac..4324e79c02c 100644 --- a/python/tests/interop/test_interop.py +++ b/python/tests/interop/test_interop.py @@ -7,6 +7,8 @@ # ============================================================================ # import cudaq, pytest +from typing import Callable +from dataclasses import dataclass cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo') @@ -242,3 +244,79 @@ def entry(): takesCapture(spin) entry.compile() + + +def test_cpp_qkernel(): + # Test the `qkernel` provided in C++ via a map-like registry. + # This is provided as a function-like callable. + kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd") + + # Use as a capture + @cudaq.kernel + def cpp_qkernel(): + q = cudaq.qvector(4) + kernel_from_cpp_registry(q, 0) + + cpp_qkernel() + + # Use as a callable argument + @cudaq.kernel + def caller(k: Callable[[cudaq.qview, int], None]): + q = cudaq.qvector(4) + k(q, 0) + + caller(kernel_from_cpp_registry) + + +def test_cpp_custom_struct(): + # Define a struct in Python that matches the C++ struct + # Note: use `repr=False` to annotate that this is an unnamed struct. + # This will maintain compatibility with C++ structs that do not have + # a name. + @dataclass(slots=True, repr=False) + class patch: + data: cudaq.qvector + aux: cudaq.qvector + + reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset") + x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x") + + # Use as a capture + @cudaq.kernel + def cpp_qkernel_struct(): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + x(q) + reset_qkernel(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct) + counts.dump() + assert len(counts) == 1 and '000000' in counts + + @cudaq.kernel + def cpp_qkernel_struct_x(): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + x_qkernel(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct_x) + counts.dump() + assert len(counts) == 1 and '111100' in counts + + # Callable + @cudaq.kernel + def cpp_qkernel_struct_callable(k: Callable[[patch], None]): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + for i in range(4): + if i % 2 == 0: + x(q[i]) + k(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct_callable, reset_qkernel) + counts.dump() + assert len(counts) == 1 and '000000' in counts + + counts = cudaq.sample(cpp_qkernel_struct_callable, x_qkernel) + counts.dump() + assert len(counts) == 1 and '010100' in counts