From 1542d442dc0975157db0c32a3fd40a58492164e9 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 15 Oct 2025 02:31:55 +0000
Subject: [PATCH 1/8] Support i1 and callable

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 python/cudaq/kernel/ast_bridge.py             | 167 ++++++++++++++++--
 python/cudaq/kernel/kernel_decorator.py       |  67 +++++--
 python/cudaq/kernel/utils.py                  |  40 ++++-
 python/runtime/cudaq/algorithms/py_run.cpp    |  42 +++--
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  22 ++-
 python/tests/kernel/test_return_vectors.py    | 132 ++++++++++++++
 6 files changed, 419 insertions(+), 51 deletions(-)
 create mode 100644 python/tests/kernel/test_return_vectors.py
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index aceab7ec75a..39c64b2274f 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -434,6 +434,26 @@ def changeOperandToType(self, ty, operand, allowDemotion=False):
                                  operand,
                                  sint=operand_width != 1,
                                  zint=operand_width == 1).result
+        
+        if quake.StruqType.isinstance(ty):
+            if quake.StruqType.isinstance(operand.type):
+                # Check that the struct types are the same, only the name may differ.
+                targetMemberType = quake.StruqType.getTypes(ty)
+                operandMemberType = quake.StruqType.getTypes(operand.type)
+                if len(targetMemberType) != len(operandMemberType):
+                    self.emitFatalError(
+                        f'cannot convert value of type {operand.type} to the requested type {ty}',
+                        self.currentNode)
+                for i in range(len(targetMemberType)):
+                    if targetMemberType[i] != operandMemberType[i]:
+                        self.emitFatalError(
+                            f'cannot convert value of type {operand.type} to the requested type {ty}',
+                            self.currentNode)
+                # It is the same struct, do a cast
+                structPtr = self.ifNotPointerThenStore(operand)
+                castedPtr =  cc.CastOp(cc.PointerType.get(ty), structPtr).result
+                return self.ifPointerThenLoad(castedPtr)
+        
         self.emitFatalError(
             f'cannot convert value of type {operand.type} to the requested type {ty}',
             self.currentNode)
@@ -578,6 +598,7 @@ def ifNotPointerThenStore(self, value):
         if not cc.PointerType.isinstance(value.type):
             slot = cc.AllocaOp(cc.PointerType.get(value.type),
                                TypeAttr.get(value.type)).result
+            assert cc.PointerType.get(value.type) == slot.type
             cc.StoreOp(value, slot)
             return slot
         return value
@@ -585,20 +606,32 @@ def ifNotPointerThenStore(self, value):
     def __createStdvecWithKnownValues(self, size, listElementValues):
         # Turn this List into a StdVec<T>
         arrSize = self.getConstantInt(size)
-        arrTy = cc.ArrayType.get(listElementValues[0].type)
+        elemTy = listElementValues[0].type
+        # If the element type is `i1`, store the values in an `i8` array.
+        isBool = elemTy == self.getIntegerType(1)
+        if isBool:
+            elemTy = self.getIntegerType(8)
+
+        arrTy = cc.ArrayType.get(elemTy)
         alloca = cc.AllocaOp(cc.PointerType.get(arrTy),
-                             TypeAttr.get(listElementValues[0].type),
+                             TypeAttr.get(elemTy),
                              seqSize=arrSize).result
 
         for i, v in enumerate(listElementValues):
             eleAddr = cc.ComputePtrOp(
-                cc.PointerType.get(listElementValues[0].type), alloca,
+                cc.PointerType.get(elemTy), alloca,
                 [self.getConstantInt(i)],
                 DenseI32ArrayAttr.get([kDynamicPtrIndex],
                                       context=self.ctx)).result
+            if isBool:
+                # Cast the list value before assigning
+                v = self.changeOperandToType(self.getIntegerType(8), v)
             cc.StoreOp(v, eleAddr)
 
-        vecTy = listElementValues[0].type
+        # Create the `StdVec<T>` from the alloca
+        # We still use `i1` as the vector element type if the
+        # original list was of booleans.
+        vecTy = elemTy if not isBool else self.getIntegerType(1)
         if cc.PointerType.isinstance(vecTy):
             vecTy = cc.PointerType.getElementType(vecTy)
 
@@ -655,6 +688,10 @@ def __copyVectorAndCastElements(self,
         if (sourceEleType == targetEleType):
             return sourcePtr
 
+        isSourceBool = sourceEleType == self.getIntegerType(1)
+        if isSourceBool:
+            sourceEleType = self.getIntegerType(8)
+
         sourceArrType = cc.ArrayType.get(sourceEleType)
         sourceElePtrTy = cc.PointerType.get(sourceEleType)
         sourceArrElePtrTy = cc.PointerType.get(sourceArrType)
@@ -662,10 +699,16 @@ def __copyVectorAndCastElements(self,
         sourceDataPtr = cc.StdvecDataOp(sourceArrElePtrTy, sourceValue).result
         sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result
 
+        isTargetBool = targetEleType == self.getIntegerType(1)
+        # Vector type reflects the true type, including `i1`
+        targetVecTy = cc.StdvecType.get(targetEleType)
+
+        if isTargetBool:
+            targetEleType = self.getIntegerType(8)
+
         targetElePtrType = cc.PointerType.get(targetEleType)
         targetTy = cc.ArrayType.get(targetEleType)
         targetArrElePtrTy = cc.PointerType.get(targetTy)
-        targetVecTy = cc.StdvecType.get(targetEleType)
         targetPtr = cc.AllocaOp(targetArrElePtrTy,
                                 TypeAttr.get(targetEleType),
                                 seqSize=sourceSize).result
@@ -681,6 +724,7 @@ def bodyBuilder(iterVar):
                                                  allowDemotion=allowDemotion)
             targetEleAddr = cc.ComputePtrOp(targetElePtrType, targetPtr,
                                             [iterVar], rawIndex).result
+            assert cc.PointerType.get(targetEleType) == targetEleAddr.type
             cc.StoreOp(castedEle, targetEleAddr)
 
         self.createInvariantForLoop(sourceSize, bodyBuilder)
@@ -777,15 +821,26 @@ def __load_vector_element(self, vector, index):
             MLIR Value containing the loaded element
         """
         if cc.StdvecType.isinstance(vector.type):
+            elem_ty = cc.StdvecType.getElementType(vector.type)
+            is_bool = elem_ty == self.getIntegerType(1)
+            # A vector of `i1` is a special case (like `std::vector<bool>` in
+            # C++): the vector type keeps `i1` as its element type, but the
+            # underlying array actually holds one `i8` value per element.
+            if is_bool:
+                # `i1` elements are stored as `i8` in the underlying array.
+                elem_ty = self.getIntegerType(8)
             data_ptr = cc.StdvecDataOp(
                 cc.PointerType.get(
-                    cc.ArrayType.get(cc.StdvecType.getElementType(
-                        vector.type))), vector).result
-            return cc.LoadOp(
+                    cc.ArrayType.get(elem_ty)), vector).result
+            load_val = cc.LoadOp(
                 cc.ComputePtrOp(
-                    cc.PointerType.get(cc.StdvecType.getElementType(
-                        vector.type)), data_ptr, [index],
+                    cc.PointerType.get(elem_ty), data_ptr, [index],
                     DenseI32ArrayAttr.get([kDynamicPtrIndex]))).result
+            if is_bool:
+                # Cast back to `i1` if the original vector element type was `i1`.
+                load_val = self.changeOperandToType(self.getIntegerType(1),
+                                                   load_val)
+            return load_val
         return cc.LoadOp(
             cc.ComputePtrOp(
                 cc.PointerType.get(
@@ -1405,6 +1460,7 @@ def process_assignment(target, value):
                     # We should allocate and store
                     alloca = cc.AllocaOp(cc.PointerType.get(value.type),
                                          TypeAttr.get(value.type)).result
+                    assert cc.PointerType.get(value.type) == alloca.type
                     cc.StoreOp(value, alloca)
                     return target, alloca
 
@@ -1438,6 +1494,8 @@ def process_assignment(target, value):
                 # Visit the value being assigned
                 self.visit(node.value)
                 valueToStore = self.popValue()
+                # Cast if necessary
+                valueToStore = self.changeOperandToType(ptrEleType, valueToStore)
                 # Store the value
                 cc.StoreOp(valueToStore, ptrVal)
                 return target.value, None
@@ -1460,6 +1518,8 @@ def process_assignment(target, value):
                 # Visit the value being assigned
                 self.visit(node.value)
                 valueToStore = self.popValue()
+                # Cast if necessary
+                valueToStore = self.changeOperandToType(cc.PointerType.getElementType(ptrVal.type), valueToStore)
                 # Store the value
                 cc.StoreOp(valueToStore, ptrVal)
                 return target.value, None
@@ -1771,6 +1831,26 @@ def processFunctionCall(fType, nrValsToPop):
                 func.CallOp(otherKernel, values)
             else:
                 result = func.CallOp(otherKernel, values).result
+                # Copy to stack if necessary
+                if cc.StdvecType.isinstance(result.type):
+                    elemTy = cc.StdvecType.getElementType(result.type)
+                    if elemTy == self.getIntegerType(1):
+                        elemTy = self.getIntegerType(8)
+                    data = cc.StdvecDataOp(cc.PointerType.get(elemTy), result).result
+                    i64Ty = self.getIntegerType(64)
+                    length = cc.StdvecSizeOp(i64Ty, result).result
+                    elemSize = cc.SizeOfOp(i64Ty, TypeAttr.get(elemTy)).result
+                    buffer = cc.AllocaOp(cc.PointerType.get(cc.ArrayType.get(elemTy)), TypeAttr.get(elemTy), seqSize=length).result
+                    i8PtrTy = cc.PointerType.get(self.getIntegerType(8))
+                    cbuffer = cc.CastOp(i8PtrTy, buffer).result
+                    cdata = cc.CastOp(i8PtrTy, data).result
+                    symName = '__nvqpp_vectorCopyToStack'
+                    load_intrinsic(self.module, symName)
+                    sizeInBytes = arith.MulIOp(length, elemSize).result
+                    func.CallOp([], symName, [cbuffer, cdata, sizeInBytes])
+                    # Replace result with the stack buffer-backed vector
+                    result = cc.StdvecInitOp(result.type, buffer, length=length).result                
+                
                 self.pushValue(result)
 
         def checkControlAndTargetTypes(controls, targets):
@@ -2474,7 +2554,18 @@ def bodyBuilder(iterVal):
                     cc.StoreOp(ctorArgs[i], eleAddr)
                 self.pushValue(stackSlot)
                 return
-
+            # Check generic callable objects that may be C++ kernels
+            elif hasattr(var, '__call__') and hasattr(var, '__module__') and hasattr(var, '__name__'):
+                    # This is a callable object, likely a C++ kernel
+                    devKey = f"{var.__module__}.{var.__name__}"
+                    if cudaq_runtime.isRegisteredDeviceModule(devKey):
+                        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
+                            self.module, devKey)
+                        if maybeKernelName != None:
+                            otherKernel = SymbolTable(
+                                self.module.operation)[maybeKernelName]
+                            processFunctionCall(otherKernel.type, len(node.args))
+                            return
             else:
                 self.emitFatalError(
                     "unhandled function call - {}, known kernels are {}".format(
@@ -2915,6 +3006,30 @@ def bodyBuilder(iterVal):
                         quake.ComputeActionOp(compute, action)
                         return
 
+                    if node.func.attr == 'to_integer':
+                        boolVec = self.popValue()
+                        boolVec = self.ifPointerThenLoad(boolVec)
+                        if not cc.StdvecType.isinstance(boolVec.type):
+                            self.emitFatalError(
+                                "to_integer expects a vector of booleans. Got type {}".format(
+                                    boolVec.type),
+                                node)
+                        elemTy = cc.StdvecType.getElementType(boolVec.type)
+                        if elemTy != self.getIntegerType(1):
+                            self.emitFatalError(
+                                "to_integer expects a vector of booleans. Got type {}".format(
+                                    boolVec.type),
+                                node)
+                        cudaqConvertToInteger = "__nvqpp_cudaqConvertToInteger"
+                        # Load the intrinsic
+                        load_intrinsic(self.module, cudaqConvertToInteger)
+                        # Signature:
+                        # `func.func private @__nvqpp_cudaqConvertToInteger(%arg : !cc.stdvec<i1>) -> i64`
+                        resultTy = self.getIntegerType(64)
+                        result = func.CallOp([resultTy], cudaqConvertToInteger, [boolVec]).result
+                        self.pushValue(result)
+                        return
+
                     self.emitFatalError(
                         f'Invalid function or class type requested from the cudaq module ({node.func.attr})',
                         node)
@@ -3453,6 +3568,11 @@ def get_item_type(pyval):
         listElemTy = get_item_type(node.elt)
         if listElemTy is None:
             return
+        
+        resultVecTy = cc.StdvecType.get(listElemTy)
+        isBool = listElemTy == self.getIntegerType(1)
+        if isBool:
+            listElemTy = self.getIntegerType(8)
         listTy = cc.ArrayType.get(listElemTy)
         listValue = cc.AllocaOp(cc.PointerType.get(listTy),
                                 TypeAttr.get(listElemTy),
@@ -3482,12 +3602,15 @@ def bodyBuilder(iterVar):
             listValueAddr = cc.ComputePtrOp(
                 cc.PointerType.get(listElemTy), listValue, [iterVar],
                 DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx))
+            
+            if isBool:
+                result = self.changeOperandToType(self.getIntegerType(8), result)
             cc.StoreOp(result, listValueAddr)
             self.symbolTable.popScope()
 
         self.createInvariantForLoop(iterableSize, bodyBuilder)
         self.pushValue(
-            cc.StdvecInitOp(cc.StdvecType.get(listElemTy),
+            cc.StdvecInitOp(resultVecTy,
                             listValue,
                             length=iterableSize).result)
         return
@@ -3679,6 +3802,9 @@ def fix_negative_idx(idx, get_size):
                                    upper=upperVal).result)
             elif cc.StdvecType.isinstance(var.type):
                 eleTy = cc.StdvecType.getElementType(var.type)
+                isBool = eleTy == self.getIntegerType(1)
+                if isBool:
+                    eleTy = self.getIntegerType(8)
                 ptrTy = cc.PointerType.get(eleTy)
                 arrTy = cc.ArrayType.get(eleTy)
                 ptrArrTy = cc.PointerType.get(arrTy)
@@ -3722,6 +3848,9 @@ def fix_negative_idx(idx, get_size):
         if cc.StdvecType.isinstance(var.type):
             idx = fix_negative_idx(idx, lambda: get_size(var))
             eleTy = cc.StdvecType.getElementType(var.type)
+            isBool = eleTy == self.getIntegerType(1)
+            if isBool:
+                eleTy = self.getIntegerType(8)
             elePtrTy = cc.PointerType.get(eleTy)
             arrTy = cc.ArrayType.get(eleTy)
             ptrArrTy = cc.PointerType.get(arrTy)
@@ -3733,7 +3862,10 @@ def fix_negative_idx(idx, get_size):
             if self.subscriptPushPointerValue:
                 self.pushValue(eleAddr)
                 return
-            self.pushValue(cc.LoadOp(eleAddr).result)
+            val = cc.LoadOp(eleAddr).result
+            if isBool:
+                val = self.changeOperandToType(self.getIntegerType(1), val)
+            self.pushValue(val)
             return
 
         if cc.PointerType.isinstance(var.type):
@@ -3960,7 +4092,9 @@ def functor(iter, idx):
                 iterEleTy = cc.StdvecType.getElementType(iterable.type)
                 totalSize = cc.StdvecSizeOp(self.getIntegerType(),
                                             iterable).result
-
+                isBool = iterEleTy == self.getIntegerType(1)
+                if isBool:
+                    iterEleTy = self.getIntegerType(8)
                 def functor(iter, idxVal):
                     elePtrTy = cc.PointerType.get(iterEleTy)
                     arrTy = cc.ArrayType.get(iterEleTy)
@@ -3970,7 +4104,10 @@ def functor(iter, idxVal):
                         elePtrTy, vecPtr, [idxVal],
                         DenseI32ArrayAttr.get([kDynamicPtrIndex],
                                               context=self.ctx)).result
-                    return cc.LoadOp(eleAddr).result
+                    result = cc.LoadOp(eleAddr).result
+                    if isBool:
+                        result = self.changeOperandToType(self.getIntegerType(1), result)
+                    return result
 
                 extractFunctor = functor
 
diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py
index 799117a07bc..f1a685468a3 100644
--- a/python/cudaq/kernel/kernel_decorator.py
+++ b/python/cudaq/kernel/kernel_decorator.py
@@ -451,6 +451,24 @@ def __convertStringsToPauli__(self, arg):
 
         return arg
 
+    def getCallableNames(self, *args):
+        """
+        Return the kernel names of the callable arguments found in `args`.
+        """
+        callableNames = []
+        for arg in args:
+            if isinstance(arg, PyKernelDecorator):
+                callableNames.append(arg.name)
+            elif all(hasattr(arg, attr) for attr in ('__call__', '__module__', '__name__')):
+                # This is a callable object, likely a C++ kernel
+                devKey = f"{arg.__module__}.{arg.__name__}"
+                if cudaq_runtime.isRegisteredDeviceModule(devKey):
+                    kernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(self.module, devKey)
+                    if kernelName is not None:
+                        # Remove "__nvqpp__mlirgen__" prefix
+                        callableNames.append(kernelName.replace("__nvqpp__mlirgen__", ""))
+        return callableNames
+    
     def __call__(self, *args):
         """
         Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR 
@@ -481,7 +499,8 @@ def __call__(self, *args):
             mlirType = mlirTypeFromPyType(type(arg),
                                           self.module.context,
                                           argInstance=arg,
-                                          argTypeToCompareTo=self.argTypes[i])
+                                          argTypeToCompareTo=self.argTypes[i],
+                                          module=self.module)
 
             if self.isCastablePyType(mlirType, self.argTypes[i]):
                 processedArgs.append(
@@ -496,19 +515,39 @@ def __call__(self, *args):
                 )
 
             if cc.CallableType.isinstance(mlirType):
-                # Assume this is a PyKernelDecorator
-                callableNames.append(arg.name)
-                # It may be that the provided input callable kernel
-                # is not currently in the ModuleOp. Need to add it
-                # if that is the case, we have to use the AST
-                # so that it shares self.module's MLIR Context
-                symbols = SymbolTable(self.module.operation)
-                if nvqppPrefix + arg.name not in symbols:
-                    tmpBridge = PyASTBridge(self.capturedDataStorage,
-                                            existingModule=self.module,
-                                            disableEntryPointTag=True)
-                    tmpBridge.visit(globalAstRegistry[arg.name][0])
-
+                if isinstance(arg, PyKernelDecorator):
+                    # Assume this is a PyKernelDecorator
+                    callableNames.append(arg.name)
+                    # It may be that the provided input callable kernel
+                    # is not currently in the ModuleOp. Need to add it
+                    # if that is the case, we have to use the AST
+                    # so that it shares self.module's MLIR Context
+                    symbols = SymbolTable(self.module.operation)
+                    if nvqppPrefix + arg.name not in symbols:
+                        tmpBridge = PyASTBridge(self.capturedDataStorage,
+                                                existingModule=self.module,
+                                                disableEntryPointTag=True)
+                        tmpBridge.visit(globalAstRegistry[arg.name][0])
+                else:
+                    # Anything that is not a PyKernelDecorator must be a
+                    # callable object exposing `__module__` and `__name__`,
+                    # from which the device module key is built.
+                    if all(
+                            hasattr(arg, attr)
+                            for attr in ('__call__', '__module__', '__name__')):
+                        # This is a callable object, likely a C++ kernel
+                        devKey = f"{arg.__module__}.{arg.__name__}"
+                        if cudaq_runtime.isRegisteredDeviceModule(devKey):
+                            maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
+                                self.module, devKey)
+                            if maybeKernelName is not None:
+                                # Remove "__nvqpp__mlirgen__" prefix
+                                callableNames.append(
+                                    maybeKernelName.replace("__nvqpp__mlirgen__", ""))
+                    else:
+                        emitFatalError(
+                            "Invalid callable argument provided to kernel."
+                        )
             # Convert `numpy` arrays to lists
             if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"):
                 if arg.ndim != 1:
diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index efaf213b581..c95c9a42cdf 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -15,8 +15,8 @@
 import types
 
 from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime
-from cudaq.mlir.dialects import quake, cc
-from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType
+from cudaq.mlir.dialects import quake, cc, func
+from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable
 
 State = cudaq_runtime.State
 qvector = cudaq_runtime.qvector
@@ -442,7 +442,41 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
     if 'argInstance' in kwargs:
         argInstance = kwargs['argInstance']
         if isinstance(argInstance, Callable):
-            return cc.CallableType.get(argInstance.argTypes, ctx)
+            # An instance that carries `argTypes` (e.g. a Python kernel) already
+            # provides the argument types of the callable.
+            if hasattr(argInstance, 'argTypes'):
+                return cc.CallableType.get(argInstance.argTypes, ctx)
+            # Otherwise this may be a callable object, likely a C++ kernel,
+            # that is identified by its "<module>.<name>" device key.
+            elif (hasattr(argInstance, '__call__') and
+                  hasattr(argInstance, '__module__') and
+                  hasattr(argInstance, '__name__')):
+                devKey = f"{argInstance.__module__}.{argInstance.__name__}"
+                # The kernel can only be looked up when its device module is
+                # registered and the caller supplied the `module` keyword.
+                if cudaq_runtime.isRegisteredDeviceModule(
+                        devKey) and "module" in kwargs:
+                    module = kwargs['module']
+                    maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
+                        module, devKey)
+                    if maybeKernelName is not None:
+                        # Look up the kernel's symbol in the given module.
+                        otherKernel = SymbolTable(
+                            module.operation)[maybeKernelName]
+                        if isinstance(otherKernel, func.FuncOp):
+                            # The callable type is built from the types of the
+                            # kernel function's arguments.
+                            argTypes = [
+                                arg.type for arg in otherKernel.arguments
+                            ]
+                            return cc.CallableType.get(argTypes, ctx)
+                        else:
+                            emitFatalError(
+                                f"Registered C++ kernel '{maybeKernelName}' is not of CallableType."
+                            )
+                # A callable that cannot be resolved to a registered C++
+                # kernel yields no type here; control falls through to the
+                # remaining checks below.
 
     for name in globalRegisteredTypes.classes:
         customTy, memberTys = globalRegisteredTypes.getClassAttributes(name)
diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp
index 8cd38a7295b..ef59b14f461 100644
--- a/python/runtime/cudaq/algorithms/py_run.cpp
+++ b/python/runtime/cudaq/algorithms/py_run.cpp
@@ -39,7 +39,8 @@ static std::vector<py::object> readRunResults(mlir::ModuleOp module,
 }
 
 static std::tuple<std::string, MlirModule, OpaqueArguments *,
-                  mlir::func::FuncOp, std::string, mlir::func::FuncOp>
+                  mlir::func::FuncOp, std::string, mlir::func::FuncOp,
+                  std::vector<std::string>>
 getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (!py::hasattr(kernel, "arguments"))
     throw std::runtime_error(
@@ -52,6 +53,11 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (py::hasattr(kernel, "compile"))
     kernel.attr("compile")();
 
+  std::vector<std::string> callableNames;
+  if (py::hasattr(kernel, "getCallableNames"))
+    callableNames =
+        kernel.attr("getCallableNames")(*args).cast<std::vector<std::string>>();
+
   auto origKernName = kernel.attr("name").cast<std::string>();
   auto kernelName = origKernName + ".run";
   if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none())
@@ -78,7 +84,8 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   }
   auto *argData = toOpaqueArgs(args, kernelMod, kernelName);
   auto funcOp = getKernelFuncOp(kernelMod, kernelName);
-  return {kernelName, kernelMod, argData, funcOp, origKernName, origKern};
+  return {kernelName,   kernelMod, argData,      funcOp,
+          origKernName, origKern,  callableNames};
 }
 
 static details::RunResultSpan
@@ -86,6 +93,7 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
                MlirModule module, mlir::func::FuncOp funcOp,
                mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs,
                quantum_platform &platform, std::size_t shots_count,
+               const std::vector<std::string> &callableNames,
                std::size_t qpu_id = 0) {
   auto returnTypes = origKernel.getResultTypes();
   if (returnTypes.empty() || returnTypes.size() > 1)
@@ -93,21 +101,24 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
         "`cudaq.run` only supports kernels that return a value.");
 
   auto returnTy = returnTypes[0];
-  // Disallow returning list / vectors from entry-point kernels.
-  if (returnTy.isa<cc::StdvecType>()) {
-    throw std::runtime_error("`cudaq.run` does not yet support returning "
-                             "`list` from entry-point kernels.");
+  // Disallow returning nested vectors from entry-point kernels.
+  if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(returnTy)) {
+    auto elemTy = vecTy.getElementType();
+    if (elemTy.isa<cudaq::cc::StdvecType>())
+      throw std::runtime_error(
+          "`cudaq.run` does not yet support returning nested `list` from "
+          "entry-point kernels.");
   }
 
   auto mod = unwrap(module);
 
-  auto [rawArgs, size, returnOffset, thunk] =
-      pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false);
+  auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase(
+      name, module, returnTy, runtimeArgs, callableNames, 0, false);
 
   auto results = details::runTheKernel(
       [&]() mutable {
         pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size,
-                       returnOffset, {});
+                       returnOffset, callableNames);
       },
       platform, name, origName, shots_count, qpu_id);
 
@@ -133,7 +144,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   if (shots_count == 0)
     return {};
 
-  auto [name, module, argData, func, origName, origKern] =
+  auto [name, module, argData, func, origName, origKern, callableNames] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -149,7 +160,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   }
 
   auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData,
-                             platform, shots_count);
+                             platform, shots_count, callableNames);
   delete argData;
   auto results = pyReadResults(span, module, func, origKern, shots_count);
 
@@ -184,7 +195,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
                              ") exceeds the number of available QPUs (" +
                              std::to_string(numQPUs) + ")");
 
-  auto [name, module, argData, func, origName, origKern] =
+  auto [name, module, argData, func, origName, origKern, callableNames] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -219,7 +230,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
     QuantumTask wrapped = detail::make_copyable_function(
         [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count,
          qpu_id, argData, name, module, func, origKern, origName,
-         noise_model = std::move(noise_model)]() mutable {
+         noise_model = std::move(noise_model), callableNames]() mutable {
           auto &platform = get_platform();
 
           // Launch the kernel in the appropriate context.
@@ -227,8 +238,9 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
             platform.set_noise(&noise_model.value());
 
           try {
-            auto span = pyRunTheKernel(name, origName, module, func, origKern,
-                                       *argData, platform, shots_count, qpu_id);
+            auto span =
+                pyRunTheKernel(name, origName, module, func, origKern, *argData,
+                               platform, shots_count, callableNames, qpu_id);
             delete argData;
             sp.set_value(span);
             ep.set_value("");
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 9db3e9e431f..16ba64bd849 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -117,8 +117,22 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod,
   auto *argData = new cudaq::OpaqueArguments();
   args = simplifiedValidateInputArguments(args);
   setDataLayout(mod);
-  cudaq::packArgs(*argData, args, kernelFunc,
-                  [](OpaqueArguments &, py::object &) { return false; });
+  auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
+                               py::object &arg) {
+    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
+      printf("Handling callable argument.\n");
+      // Just give it some dummy data that will not be used.
+      // We synthesize away all callables, the block argument
+      // remains but it is not used, so just give argsCreator
+      // something, and we'll make sure it's cleaned up.
+      long *ourAllocatedArg = new long();
+      argData.emplace_back(ourAllocatedArg,
+                           [](void *ptr) { delete static_cast<long *>(ptr); });
+      return true;
+    }
+    return false;
+  };
+  cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler);
   return argData;
 }
 
@@ -157,7 +171,6 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module,
     pm.addPass(cudaq::opt::createGenerateKernelExecution(
         {.startingArgIdx = startingArgIdx}));
     pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true}));
-    pm.addPass(cudaq::opt::createReturnToOutputLog());
     pm.addPass(cudaq::opt::createLambdaLiftingPass());
     pm.addPass(cudaq::opt::createDistributedDeviceCall());
     std::string tl = getTransportLayer();
@@ -947,7 +960,8 @@ void bindAltLaunchKernel(py::module &mod,
 
   auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
                                py::object &arg) {
-    if (py::hasattr(arg, "module")) {
+    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
+      printf("Handling callable argument.\n");
       // Just give it some dummy data that will not be used.
       // We synthesize away all callables, the block argument
       // remains but it is not used, so just give argsCreator
diff --git a/python/tests/kernel/test_return_vectors.py b/python/tests/kernel/test_return_vectors.py
new file mode 100644
index 00000000000..376aab05613
--- /dev/null
+++ b/python/tests/kernel/test_return_vectors.py
@@ -0,0 +1,132 @@
+import cudaq
+import pytest
+import os
+
+
+def testReturnVectorBool():
+
+    @cudaq.kernel
+    def return_vec_bool() -> list[bool]:
+        ret = [True, False]
+        return ret
+
+    res = cudaq.run(return_vec_bool, shots_count=1)
+    assert res == [[True, False]]
+
+    @cudaq.kernel
+    def return_vec_bool_from_measure() -> list[bool]:
+        q = cudaq.qvector(5)
+        x(q)
+        ret = mz(q)
+        return ret
+
+    res = cudaq.run(return_vec_bool_from_measure, shots_count=10)
+    assert res == [[True] * 5] * 10
+
+    @cudaq.kernel
+    def return_vec_bool_from_measure_mix() -> list[bool]:
+        q = cudaq.qvector(5)
+        for i in range(5):
+            if i % 2 == 0:
+                x(q[i])
+        ret = mz(q)
+        return ret
+
+    res = cudaq.run(return_vec_bool_from_measure_mix, shots_count=10)
+    assert res == [[True, False, True, False, True]] * 10
+
+
+def testReturnVectorInt():
+
+    @cudaq.kernel
+    def return_vec_int() -> list[int]:
+        ret = [1, 2, 3]
+        return ret
+
+    res = cudaq.run(return_vec_int, shots_count=1)
+    assert res == [[1, 2, 3]]
+
+    @cudaq.kernel
+    def return_vec_int_from_measure() -> list[int]:
+        q = cudaq.qvector(5)
+        x(q)
+        ret = mz(q)
+        int_ret = [0 for b in ret]
+        i = 0
+        for b in ret:
+            if b:
+                int_ret[i] = 6
+            i += 1
+        return int_ret
+
+    res = cudaq.run(return_vec_int_from_measure, shots_count=10)
+    assert res == [[6] * 5] * 10
+
+    @cudaq.kernel
+    def return_vec_int_from_measure_mix() -> list[int]:
+        q = cudaq.qvector(5)
+        for i in range(5):
+            if i % 2 == 0:
+                x(q[i])
+        ret = mz(q)
+        int_ret = [0 for b in ret]
+        i = 0
+        for b in ret:
+            if b:
+                int_ret[i] = 6
+            i += 1
+        return int_ret
+
+    res = cudaq.run(return_vec_int_from_measure_mix, shots_count=10)
+    assert res == [[6, 0, 6, 0, 6]] * 10
+
+
+def testReturnVectorFloat():
+
+    @cudaq.kernel
+    def return_vec_float() -> list[float]:
+        ret = [1.1, 2.2, 3.3]
+        return ret
+
+    res = cudaq.run(return_vec_float, shots_count=1)
+    assert res == [[1.1, 2.2, 3.3]]
+
+    @cudaq.kernel
+    def return_vec_float_from_measure() -> list[float]:
+        q = cudaq.qvector(5)
+        x(q)
+        ret = mz(q)
+        float_ret = [0.0 for b in ret]
+        i = 0
+        for b in ret:
+            if b:
+                float_ret[i] = 6.6
+            i += 1
+        return float_ret
+
+    res = cudaq.run(return_vec_float_from_measure, shots_count=10)
+    assert res == [[6.6] * 5] * 10
+
+    @cudaq.kernel
+    def return_vec_float_from_measure_mix() -> list[float]:
+        q = cudaq.qvector(5)
+        for i in range(5):
+            if i % 2 == 0:
+                x(q[i])
+        ret = mz(q)
+        float_ret = [0.0 for b in ret]
+        i = 0
+        for b in ret:
+            if b:
+                float_ret[i] = 6.6
+            i += 1
+        return float_ret
+
+    res = cudaq.run(return_vec_float_from_measure_mix, shots_count=10)
+    assert res == [[6.6, 0.0, 6.6, 0.0, 6.6]] * 10
+
+
+# leave for gdb debugging
+if __name__ == "__main__":
+    loc = os.path.abspath(__file__)
+    pytest.main([loc, "-s"])

From 7e681a3dbebf78d372ad1f48df95a0a1057cd626 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Tue, 21 Oct 2025 07:05:18 +0000
Subject: [PATCH 2/8] Add tests

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 python/cudaq/kernel/ast_bridge.py             | 27 ++++---
 .../cudaq/platform/py_alt_launch_kernel.cpp   |  2 -
 .../tests/interop/quantum_lib/quantum_lib.cpp |  8 +++
 .../tests/interop/quantum_lib/quantum_lib.h   | 10 +++
 .../test_cpp_quantum_algorithm_module.cpp     | 71 +++++++++++++++++++
 python/tests/interop/test_interop.py          | 58 +++++++++++++++
 6 files changed, 163 insertions(+), 13 deletions(-)

diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 39c64b2274f..59665ef9292 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -2555,17 +2555,22 @@ def bodyBuilder(iterVal):
                 self.pushValue(stackSlot)
                 return
             # Check generic callable objects that may be C++ kernels
-            elif hasattr(var, '__call__') and hasattr(var, '__module__') and hasattr(var, '__name__'):
-                    # This is a callable object, likely a C++ kernel
-                    devKey = f"{var.__module__}.{var.__name__}"
-                    if cudaq_runtime.isRegisteredDeviceModule(devKey):
-                        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
-                            self.module, devKey)
-                        if maybeKernelName != None:
-                            otherKernel = SymbolTable(
-                                self.module.operation)[maybeKernelName]
-                            processFunctionCall(otherKernel.type, len(node.args))
-                            return
+            elif hasattr(var, '__call__'):
+                # This is a callable object, which could be a C++ kernel
+                # Get the full module + name key and see if it is registered
+                modulePath = str(var.__module__) if hasattr(
+                    var, '__module__') else ''
+                funcName = str(var.__name__) if hasattr(
+                    var, '__name__') else ''
+                devKey = f"{modulePath}.{funcName}"
+                if cudaq_runtime.isRegisteredDeviceModule(devKey):
+                    maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
+                        self.module, devKey)
+                    if maybeKernelName != None:
+                        otherKernel = SymbolTable(
+                            self.module.operation)[maybeKernelName]
+                        processFunctionCall(otherKernel.type, len(node.args))
+                        return
             else:
                 self.emitFatalError(
                     "unhandled function call - {}, known kernels are {}".format(
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 16ba64bd849..5543a54b37d 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -120,7 +120,6 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod,
   auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
                                py::object &arg) {
     if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
-      printf("Handling callable argument.\n");
       // Just give it some dummy data that will not be used.
       // We synthesize away all callables, the block argument
       // remains but it is not used, so just give argsCreator
@@ -961,7 +960,6 @@ void bindAltLaunchKernel(py::module &mod,
   auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
                                py::object &arg) {
     if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
-      printf("Handling callable argument.\n");
       // Just give it some dummy data that will not be used.
       // We synthesize away all callables, the block argument
       // remains but it is not used, so just give argsCreator
diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp
index 7c5cbb23054..ed313b32f88 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.cpp
+++ b/python/tests/interop/quantum_lib/quantum_lib.cpp
@@ -29,4 +29,12 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector<double> &x,
 __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); }
 
 __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); }
+
+__qpu__ void reset_group(patch p) {
+  for (std::size_t i = 0; i < p.data.size(); i++)
+    reset(p.data[i]);
+}
+
+__qpu__ void x_group(patch p) { x(p.data); }
+
 } // namespace cudaq
diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h
index a0655099237..4b9fa371351 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.h
+++ b/python/tests/interop/quantum_lib/quantum_lib.h
@@ -9,6 +9,12 @@
 
 #include "cudaq/qis/qubit_qis.h"
 
+// Custom data structure
+struct patch {
+  cudaq::qview<> data;
+  cudaq::qview<> aux;
+};
+
 namespace cudaq {
 void entryPoint(const std::function<void(cudaq::qvector<> &)> &statePrep);
 
@@ -19,4 +25,8 @@ void another(cudaq::qview<> qubits, std::size_t);
 
 void uccsd(cudaq::qview<> qubits, std::size_t);
 
+void reset_group(patch p);
+
+void x_group(patch p);
+
 } // namespace cudaq
diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
index 4ea2d2176cc..9d0b54bfa57 100644
--- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
+++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
@@ -8,6 +8,7 @@
 
 #include "cudaq.h"
 #include "cudaq/algorithms/sample.h"
+#include "cudaq/qis/qkernel.h"
 #include "quantum_lib/quantum_lib.h"
 #include "runtime/interop/PythonCppInterop.h"
 #include <pybind11/pybind11.h>
@@ -15,6 +16,22 @@
 
 namespace py = pybind11;
 
+namespace {
+static std::unordered_map<std::string,
+                          cudaq::qkernel<void(cudaq::qview<>, std::size_t)>>
+    g_cppKernels_1;
+
+static std::unordered_map<std::string, cudaq::qkernel<void(patch)>>
+    g_cppKernels_2;
+
+static const bool initKernels = []() {
+  g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd));
+  g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group));
+  g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group));
+  return true;
+}();
+} // namespace
+
 PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   m.def("test_cpp_qalgo", [](py::object statePrepIn) {
@@ -49,4 +66,58 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   cudaq::python::addDeviceKernelInterop<cudaq::qview<>, std::size_t>(
       m, "qstd", "uccsd", "");
+
+  // Convert the C++ kernel registry to Python-accessible kernels
+  auto interopSubMod = m.def_submodule("_cpp_interop_kernels");
+  static std::unordered_map<std::string, py::object> g_py_kernels;
+
+  for (auto &[name, kernel] : g_cppKernels_1) {
+    const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull(
+        cudaq::registry::__cudaq_getLinkableKernelKey(&kernel));
+    if (!qkernelName) {
+      throw std::runtime_error("Could not find registered kernel name for " +
+                               name);
+    }
+
+    std::string kernelName = qkernelName;
+    if (kernelName.starts_with("function_"))
+      kernelName = kernelName.substr(std::string("function_").length());
+
+    interopSubMod.def(
+        kernelName.c_str(), [](py::object qview, std::size_t i) {},
+        "Auto-generated one-qubit encoding kernel from C++ code");
+    cudaq::python::registerDeviceKernel(
+        interopSubMod.attr("__name__").cast<std::string>(), kernelName, "");
+    g_py_kernels.insert(
+        std::make_pair(name, interopSubMod.attr(kernelName.c_str())));
+  }
+
+  for (auto &[name, kernel] : g_cppKernels_2) {
+    const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull(
+        cudaq::registry::__cudaq_getLinkableKernelKey(&kernel));
+    if (!qkernelName) {
+      throw std::runtime_error("Could not find registered kernel name for " +
+                               name);
+    }
+
+    std::string kernelName = qkernelName;
+    if (kernelName.starts_with("function_"))
+      kernelName = kernelName.substr(std::string("function_").length());
+
+    interopSubMod.def(
+        kernelName.c_str(), [](py::object patch) {},
+        "Auto-generated one-qubit encoding kernel from C++ code");
+    cudaq::python::registerDeviceKernel(
+        interopSubMod.attr("__name__").cast<std::string>(), kernelName, "");
+    g_py_kernels.insert(
+        std::make_pair(name, interopSubMod.attr(kernelName.c_str())));
+  }
+
+  m.def("get_cpp_kernel", [](const std::string &name) {
+    auto it = g_py_kernels.find(name);
+    if (it == g_py_kernels.end())
+      throw std::runtime_error("No C++ kernel registered for requested name.");
+
+    return it->second;
+  });
 }
diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py
index e63588408ac..7e1e7e05fe9 100644
--- a/python/tests/interop/test_interop.py
+++ b/python/tests/interop/test_interop.py
@@ -7,6 +7,8 @@
 # ============================================================================ #
 
 import cudaq, pytest
+from typing import Callable
+from dataclasses import dataclass
 
 cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo')
 
@@ -242,3 +244,59 @@ def entry():
         takesCapture(spin)
 
     entry.compile()
+
+def test_cpp_qkernel():
+    """Call a C++ `qkernel` obtained from the module's name-keyed registry.
+    The registry hands it back as a plain function-like Python callable."""
+    kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd")
+
+    # Use as a capture
+    @cudaq.kernel
+    def cpp_qkernel():
+        q = cudaq.qvector(4)
+        kernel_from_cpp_registry(q, 0)
+
+    cpp_qkernel()
+
+    # Use as a callable argument: the C++ kernel is passed in at call time
+    # rather than captured from the enclosing scope.
+    @cudaq.kernel
+    def caller(k: Callable[[cudaq.qview, int], None]):
+        q = cudaq.qvector(4)
+        k(q, 0)
+
+    caller(kernel_from_cpp_registry)
+
+
+def test_cpp_custom_struct():
+    """Pass a dataclass mirroring the C++ `patch` struct to C++ kernels."""
+    @dataclass(slots=True)
+    class patch:
+        data: cudaq.qvector
+        aux: cudaq.qvector
+
+    reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset")
+    x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x")
+
+    # Use as a capture: X on all data qubits, then reset them in C++
+    @cudaq.kernel
+    def cpp_qkernel_struct():
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        x(q)
+        reset_qkernel(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct)
+    counts.dump()
+    assert len(counts) == 1 and '000000' in counts
+
+    @cudaq.kernel
+    def cpp_qkernel_struct_x():
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        x_qkernel(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct_x)
+    counts.dump()
+    assert len(counts) == 1 and '111100' in counts
+

From 7c27ba50dcea0e55e235543d9ad350e1bb397113 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Tue, 21 Oct 2025 07:09:25 +0000
Subject: [PATCH 3/8] Remove temp code

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 python/cudaq/kernel/utils.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index c95c9a42cdf..8c0b62327e3 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -443,13 +443,11 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
         argInstance = kwargs['argInstance']
         if isinstance(argInstance, Callable):
             if hasattr(argInstance, 'argTypes'):
-                print("Found Callable with argTypes:", argInstance.argTypes)
                 return cc.CallableType.get(argInstance.argTypes, ctx)
             elif hasattr(argInstance, '__call__') and hasattr(argInstance, '__module__') and hasattr(argInstance, '__name__'):
                 # This is a callable object, likely a C++ kernel
                 devKey = f"{argInstance.__module__}.{argInstance.__name__}"
                 if cudaq_runtime.isRegisteredDeviceModule(devKey):
-                    print("Found registered device module for callable object:", devKey)
                     if "module" in kwargs:
                         module = kwargs['module']
                         maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
@@ -457,26 +455,15 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
                         if maybeKernelName != None:
                             otherKernel = SymbolTable(
                                 module.operation)[maybeKernelName]
-                            print("Found registered C++ kernel:", maybeKernelName)
-                            print("Other kernel type:", otherKernel.type)
-                            print("Other kernel:", otherKernel)
                             if isinstance(otherKernel, func.FuncOp):
-                                print("HEY:", dir(otherKernel.type))
-                                print("HOW:", otherKernel.arguments)
                                 argTypes = []
                                 for arg in otherKernel.arguments:
-                                    print("ARG TYPE:", arg.type)
                                     argTypes.append(arg.type)
                                 return cc.CallableType.get(argTypes, ctx)
                             else:
                                 emitFatalError(
                                     f"Registered C++ kernel '{maybeKernelName}' is not of CallableType."
                                 )
-                    # maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(self.module, devKey)
-                    # if maybeKernelName != None:
-                    #     otherKernel = SymbolTable(
-                    #         self.module.operation)[maybeKernelName]
-                    #     processFunctionCall(otherKernel.type, len(node.args))
 
     for name in globalRegisteredTypes.classes:
         customTy, memberTys = globalRegisteredTypes.getClassAttributes(name)

From d4579af48d70d6df7c2acdbe905172bbb721952c Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Tue, 21 Oct 2025 07:19:31 +0000
Subject: [PATCH 4/8] Add to_integer test for Python

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 python/tests/kernel/test_to_integer.py | 41 ++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 python/tests/kernel/test_to_integer.py

diff --git a/python/tests/kernel/test_to_integer.py b/python/tests/kernel/test_to_integer.py
new file mode 100644
index 00000000000..959341a3698
--- /dev/null
+++ b/python/tests/kernel/test_to_integer.py
@@ -0,0 +1,41 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+import pytest
+import os
+import cudaq
+
+def testToInteger():
+    """Check `cudaq.to_integer` on measured bits for several bit patterns."""
+
+    @cudaq.kernel
+    def toIntegerKernel(applyX: list[int]) -> int:
+        q = cudaq.qvector(len(applyX))
+        for i in range(len(applyX)):
+            if applyX[i]:
+                x(q[i])
+        return cudaq.to_integer(mz(q))
+
+    # (applyX, expected) pairs; see targettests/execution/to_integer.cpp
+    cases = [
+        ([1, 1, 1], 7),
+        ([1, 1, 1, 1], 15),
+        ([1, 0, 1], 5),
+        ([1, 0, 0, 0], 1),
+        ([0, 0, 0, 1], 8),
+    ]
+    for applyX, expected in cases:
+        results = cudaq.run(toIntegerKernel, applyX)
+        # All shots should yield the same integer result
+        for result in results:
+            assert result == expected
+
+
+# leave for gdb debugging
+if __name__ == "__main__":
+    loc = os.path.abspath(__file__)
+    pytest.main([loc, "-rP"])
\ No newline at end of file

From d06fb9333034cc048a5c18f3f6673742b008e198 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 22 Oct 2025 02:50:53 +0000
Subject: [PATCH 5/8] Unblock all Python run tests for returning vectors

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 lib/Optimizer/Builder/Intrinsics.cpp         | 130 ++++++++++++
 lib/Optimizer/CodeGen/Pipelines.cpp          |   1 +
 lib/Optimizer/CodeGen/ReturnToOutputLog.cpp  | 155 +++++++++++++-
 python/cudaq/kernel/ast_bridge.py            |  34 +--
 python/tests/kernel/test_return_vectors.py   | 132 ------------
 python/tests/kernel/test_run_async_kernel.py | 149 +++++++------
 python/tests/kernel/test_run_kernel.py       | 208 +++++++++++++------
 7 files changed, 524 insertions(+), 285 deletions(-)
 delete mode 100644 python/tests/kernel/test_return_vectors.py

diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index fd7622981fb..2931adefae7 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -371,6 +371,136 @@ static constexpr IntrinsicCode intrinsicTable[] = {
     {cudaq::stdvecBoolCtorFromInitList, {}, R"#(
   func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr<none>, !cc.ptr<none>, i64) -> ())#"},
 
+    {"__nvqpp_internal_number_of_digits", {}, R"#(
+  func.func private @__nvqpp_internal_number_of_digits(%arg0: i64) -> i64 {
+    %c10_i64 = arith.constant 10 : i64 
+    %c0_i64 = arith.constant 0 : i64 
+    %c1_i64 = arith.constant 1 : i64 
+    %0 = cc.alloca i64 
+    cc.store %arg0, %0 : !cc.ptr<i64> 
+    %1 = cc.load %0 : !cc.ptr<i64> 
+    %2 = cc.alloca i64 
+    cc.store %c0_i64, %2 : !cc.ptr<i64> 
+    %3 = arith.cmpi eq, %1, %c0_i64 : i64 
+    cc.if(%3) {
+      cc.store %c1_i64, %2 : !cc.ptr<i64> 
+    } 
+    cc.loop while {
+      %5 = cc.load %0 : !cc.ptr<i64> 
+      %6 = arith.cmpi sgt, %5, %c0_i64 : i64 
+      cc.condition %6 
+    } do {
+      %5 = cc.load %0 : !cc.ptr<i64> 
+      %6 = arith.divsi %5, %c10_i64 : i64 
+      cc.store %6, %0 : !cc.ptr<i64> 
+      %7 = cc.load %2 : !cc.ptr<i64> 
+      %8 = arith.addi %7, %c1_i64 : i64 
+      cc.store %8, %2 : !cc.ptr<i64> 
+      cc.continue 
+    } 
+    %4 = cc.load %2 : !cc.ptr<i64> 
+    return %4 : i64 
+  } 
+  )#"},
+
+    // Writes %val in decimal at the end of %buf: NUL-terminated, '0'-padded.
+    {"__nvqpp_internal_tostring", {}, R"#(
+  func.func private @__nvqpp_internal_tostring(%buf: !cc.stdvec<i8>, %val: i64) {
+    %c48_i64 = arith.constant 48 : i64 
+    %c48_i32 = arith.constant 48 : i32 
+    %c0_i64 = arith.constant 0 : i64 
+    %c10_i64 = arith.constant 10 : i64 
+    %c1_i64 = arith.constant 1 : i64 
+    %c48_i8 = arith.constant 48 : i8 
+    %false = arith.constant false 
+    %c0_i8 = arith.constant 0 : i8 
+    %0 = cc.alloca i64 
+    cc.store %val, %0 : !cc.ptr<i64> 
+    %1 = cc.alloca i64 
+    cc.store %c10_i64, %1 : !cc.ptr<i64> 
+    %2 = cc.stdvec_size %buf : (!cc.stdvec<i8>) -> i64 
+    %3 = cc.alloca i64 
+    cc.store %2, %3 : !cc.ptr<i64> 
+    %4 = cc.load %3 : !cc.ptr<i64> 
+    %5 = arith.subi %4, %c1_i64 : i64 
+    %6 = cc.alloca i64 
+    cc.store %5, %6 : !cc.ptr<i64> 
+    %7 = cc.load %6 : !cc.ptr<i64> 
+    %8 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>> 
+    %9 = cc.compute_ptr %8[%7] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8> 
+    cc.store %c0_i8, %9 : !cc.ptr<i8> 
+    %10 = cc.load %6 : !cc.ptr<i64> 
+    %11 = arith.subi %10, %c1_i64 : i64 
+    cc.store %11, %6 : !cc.ptr<i64> 
+    cc.loop while {
+      %18 = cc.load %0 : !cc.ptr<i64> 
+      %19 = cc.load %1 : !cc.ptr<i64> 
+      %20 = arith.cmpi sge, %18, %19 : i64 
+      %21 = arith.cmpi eq, %20, %false : i1 
+      %22 = cc.if(%21) -> i1 {
+        cc.continue %false : i1 
+      } else {
+        %23 = cc.load %6 : !cc.ptr<i64> 
+        %24 = arith.cmpi sge, %23, %c0_i64 : i64 
+        cc.continue %24 : i1 
+      } 
+      cc.condition %22 
+    } do {
+      cc.scope {
+        %18 = cc.load %0 : !cc.ptr<i64> 
+        %19 = cc.load %1 : !cc.ptr<i64> 
+        %20 = arith.remsi %18, %19 : i64 
+        %21 = cc.cast %20 : (i64) -> i32 
+        %22 = cc.alloca i32 
+        cc.store %21, %22 : !cc.ptr<i32> 
+        %23 = cc.load %1 : !cc.ptr<i64> 
+        %24 = cc.load %0 : !cc.ptr<i64> 
+        %25 = arith.divsi %24, %23 : i64 
+        cc.store %25, %0 : !cc.ptr<i64> 
+        %26 = cc.load %6 : !cc.ptr<i64> 
+        %27 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>> 
+        %28 = cc.compute_ptr %27[%26] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8> 
+        %29 = cc.load %22 : !cc.ptr<i32> 
+        %30 = arith.addi %29, %c48_i32 : i32 
+        %31 = cc.cast %30 : (i32) -> i8 
+        cc.store %31, %28 : !cc.ptr<i8> 
+        %32 = cc.load %6 : !cc.ptr<i64> 
+        %33 = arith.subi %32, %c1_i64 : i64 
+        cc.store %33, %6 : !cc.ptr<i64> 
+      }
+      cc.continue
+    }
+    %12 = cc.load %6 : !cc.ptr<i64>
+    %13 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
+    %14 = cc.compute_ptr %13[%12] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
+    %15 = cc.load %0 : !cc.ptr<i64>
+    %16 = arith.addi %15, %c48_i64 : i64
+    %17 = cc.cast %16 : (i64) -> i8
+    cc.store %17, %14 : !cc.ptr<i8>
+    cc.scope {
+      %18 = cc.alloca i64
+      cc.store %c0_i64, %18 : !cc.ptr<i64>
+      cc.loop while {
+        %19 = cc.load %18 : !cc.ptr<i64>
+        %20 = cc.load %6 : !cc.ptr<i64>
+        %21 = arith.cmpi slt, %19, %20 : i64
+        cc.condition %21
+      } do {
+        %19 = cc.load %18 : !cc.ptr<i64>
+        %20 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
+        %21 = cc.compute_ptr %20[%19] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
+        cc.store %c48_i8, %21 : !cc.ptr<i8>
+        cc.continue
+      } step {
+        %19 = cc.load %18 : !cc.ptr<i64>
+        %20 = arith.addi %19, %c1_i64 : i64
+        cc.store %20, %18 : !cc.ptr<i64>
+      }
+    }
+    return
+  }
+  )#"},
+
     // This helper function copies a buffer off the stack to the heap. This is
     // required when the data on the stack is about to go out of scope but is
     // still live.
diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp
index c4d0141afd4..a8a3f918968 100644
--- a/lib/Optimizer/CodeGen/Pipelines.cpp
+++ b/lib/Optimizer/CodeGen/Pipelines.cpp
@@ -98,6 +98,7 @@ void createTargetCodegenPipeline(PassManager &pm,
   pm.addNestedPass<func::FuncOp>(createCSEPass());
   ::addQIRConversionPipeline(pm, options.target);
   pm.addPass(cudaq::opt::createReturnToOutputLog());
+  cudaq::opt::addLowerToCFG(pm);
   pm.addPass(createConvertMathToFuncs());
   pm.addPass(createSymbolDCEPass());
   pm.addPass(cudaq::opt::createCCToLLVM());
diff --git a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
index b4b175a31dd..09f280ff255 100644
--- a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
+++ b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
@@ -46,7 +46,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
   }
 
   static void genOutputLog(Location loc, PatternRewriter &rewriter, Value val,
-                           std::optional<StringRef> prefix) {
+                           std::optional<StringRef> prefix,
+                           std::optional<Value> customLabel = std::nullopt) {
     Type valTy = val.getType();
     TypeSwitch<Type>(valTy)
         .Case([&](IntegerType intTy) {
@@ -54,7 +55,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           std::string labelStr = std::string("i") + std::to_string(width);
           if (prefix)
             labelStr = prefix->str();
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel.value_or(makeLabel(loc, rewriter, labelStr));
           if (intTy.getWidth() == 1) {
             rewriter.create<func::CallOp>(loc, TypeRange{},
                                           cudaq::opt::QIRBoolRecordOutput,
@@ -80,7 +82,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           std::string labelStr = std::string("f") + std::to_string(width);
           if (prefix)
             labelStr = prefix->str();
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel.value_or(makeLabel(loc, rewriter, labelStr));
           // Floating point: convert it to double, whatever it actually is.
           Value castVal = val;
           if (floatTy != rewriter.getF64Type())
@@ -94,7 +97,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           auto labelStr = translateType(structTy);
           if (prefix)
             labelStr = prefix->str();
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel.value_or(makeLabel(loc, rewriter, labelStr));
           std::int32_t sz = structTy.getNumMembers();
           Value size = rewriter.create<arith::ConstantIntOp>(loc, sz, 64);
           rewriter.create<func::CallOp>(loc, TypeRange{},
@@ -111,7 +115,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
         })
         .Case([&](cudaq::cc::ArrayType arrTy) {
           auto labelStr = translateType(arrTy);
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel.value_or(makeLabel(loc, rewriter, labelStr));
           std::int32_t sz = arrTy.getSize();
           Value size = rewriter.create<arith::ConstantIntOp>(loc, sz, 64);
           rewriter.create<func::CallOp>(loc, TypeRange{},
@@ -128,13 +133,12 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           }
         })
         .Case([&](cudaq::cc::StdvecType vecTy) {
-          // For this type, we expect a cc.stdvec_init operation as the input.
-          // The data will be in a variable.
-          // If we reach here and we cannot determine the constant size of the
-          // buffer, then we will not generate any output logging.
           if (auto vecInit = val.getDefiningOp<cudaq::cc::StdvecInitOp>())
             if (auto maybeLen = cudaq::opt::factory::maybeValueOfIntConstant(
                     vecInit.getLength())) {
+              // For this type, we expect a cc.stdvec_init operation as the
+              // input.
+              // The data will be in a variable.
               std::int32_t sz = *maybeLen;
               auto labelStr = translateType(vecTy, sz);
               Value label = makeLabel(loc, rewriter, labelStr);
@@ -159,6 +163,53 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
                 genOutputLog(loc, rewriter, w, offset);
               }
             }
+
+          // If we reach here and we cannot determine the constant size of the
+          // buffer, then we will generate dynamic output logging with a for
+          // loop.
+          Value vecSz = rewriter.template create<cudaq::cc::StdvecSizeOp>(
+              loc, rewriter.getI64Type(), val);
+          const std::string arrayLabelPrefix =
+              "array<" + translateType(vecTy.getElementType()) + " x ";
+          Value labelBuffer =
+              makeLabel(loc, rewriter, arrayLabelPrefix, vecSz, ">");
+          rewriter.create<func::CallOp>(loc, TypeRange{},
+                                        cudaq::opt::QIRArrayRecordOutput,
+                                        ArrayRef<Value>{vecSz, labelBuffer});
+          auto eleTy = vecTy.getElementType();
+          const bool isBool = (eleTy == rewriter.getI1Type());
+          if (isBool)
+            eleTy = rewriter.getI8Type();
+          auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
+          auto eleArrTy =
+              cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
+          auto vecPtr =
+              rewriter.create<cudaq::cc::StdvecDataOp>(loc, eleArrTy, val);
+          const std::string preStr = prefix ? prefix->str() : std::string{};
+          cudaq::opt::factory::createInvariantLoop(
+              rewriter, loc, vecSz,
+              [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+                Value indexVar = block.getArgument(0);
+                auto eleAddr = rewriter.create<cudaq::cc::ComputePtrOp>(
+                    loc, elePtrTy, vecPtr, ValueRange{indexVar});
+
+                Value w = [&]() {
+                  if (isBool) {
+                    auto i1PtrTy =
+                        cudaq::cc::PointerType::get(rewriter.getI1Type());
+                    auto i1Cast = rewriter.create<cudaq::cc::CastOp>(
+                        loc, i1PtrTy, eleAddr);
+                    return rewriter.create<cudaq::cc::LoadOp>(loc, i1Cast);
+                  }
+
+                  return rewriter.create<cudaq::cc::LoadOp>(loc, eleAddr);
+                }();
+                const std::string prefix = preStr + "[";
+                const std::string postfix = "]";
+                Value dynamicLabel =
+                    makeLabel(loc, rewriter, prefix, indexVar, postfix);
+                genOutputLog(loc, rewriter, w, std::nullopt, dynamicLabel);
+              });
         })
         .Default([&](Type) {
           // If we reach here, we don't know how to handle this type.
@@ -207,6 +258,79 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
     auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type());
     return rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, lit);
   }
+
+  static Value makeLabel(Location loc, PatternRewriter &rewriter,
+                         const std::string &prefix, Value val,
+                         const std::string &postFix) {
+    auto i64Ty = rewriter.getI64Type();
+    auto i8Ty = rewriter.getI8Type();
+    auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty);
+    // Build run-time label <prefix><val as text><postFix>; val must be i64.
+    if (val.getType() != i64Ty)
+      val = rewriter.create<cudaq::cc::CastOp>(loc, i64Ty, val);
+    // Compute (at run time) the number of digits needed to print val.
+    Value numDigits = rewriter
+                          .create<func::CallOp>(
+                              loc, i64Ty, "__nvqpp_internal_number_of_digits",
+                              ArrayRef<Value>{val})
+                          .getResult(0);
+    Value valStrBuf = [&]() {
+      // Stringify val into an alloca'd buffer of numDigits + 1 bytes.
+      auto strSize = rewriter.create<arith::AddIOp>(
+          loc, numDigits,
+          rewriter.create<arith::ConstantIntOp>(loc, 1,
+                                                64)); // Add null terminator
+      auto buffer = rewriter.create<cudaq::cc::AllocaOp>(loc, i8Ty, strSize);
+      auto stdvecTy = cudaq::cc::StdvecType::get(i8Ty);
+      auto stringCharVec = rewriter.create<cudaq::cc::StdvecInitOp>(
+          loc, stdvecTy, buffer, strSize);
+      rewriter.create<func::CallOp>(loc, TypeRange{},
+                                    "__nvqpp_internal_tostring",
+                                    ArrayRef<Value>{stringCharVec, val});
+      return rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, buffer);
+    }();
+
+    Value arrayPrefix = makeLabel(loc, rewriter, prefix);
+    Value arrayPostfix = makeLabel(loc, rewriter, postFix);
+    const int preFixLen = prefix.size();
+    const int postFixLen = postFix.size();
+    Value totalStrSize = rewriter.create<arith::AddIOp>(
+        loc, numDigits,
+        rewriter.create<arith::ConstantIntOp>(loc, preFixLen + postFixLen + 1,
+                                              64));
+    auto labelBufferAlloc =
+        rewriter.create<cudaq::cc::AllocaOp>(loc, i8Ty, totalStrSize);
+    Value labelBuffer =
+        rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, labelBufferAlloc);
+
+    // Copy the prefix (without its terminator) to the start of the buffer.
+    rewriter.create<func::CallOp>(
+        loc, std::nullopt, cudaq::llvmMemCopyIntrinsic,
+        ValueRange{labelBuffer, arrayPrefix,
+                   rewriter.create<arith::ConstantIntOp>(loc, preFixLen, 64),
+                   rewriter.create<arith::ConstantIntOp>(loc, 0, 1)});
+    // Copy the numDigits digit characters right after the prefix.
+    auto toPtr = rewriter.create<cudaq::cc::ComputePtrOp>(
+        loc, i8PtrTy, labelBufferAlloc,
+        ValueRange{rewriter.create<arith::ConstantIntOp>(loc, preFixLen, 64)});
+    rewriter.create<func::CallOp>(
+        loc, std::nullopt, cudaq::llvmMemCopyIntrinsic,
+        ValueRange{toPtr, valStrBuf, numDigits,
+                   rewriter.create<arith::ConstantIntOp>(loc, 0, 1)});
+    // Copy the postfix plus its null terminator (postFixLen + 1 bytes).
+    Value shift = rewriter.create<arith::AddIOp>(
+        loc, numDigits,
+        rewriter.create<arith::ConstantIntOp>(loc, preFixLen, 64));
+    toPtr = rewriter.create<cudaq::cc::ComputePtrOp>(
+        loc, i8PtrTy, labelBufferAlloc, ValueRange{shift});
+    rewriter.create<func::CallOp>(
+        loc, std::nullopt, cudaq::llvmMemCopyIntrinsic,
+        ValueRange{
+            toPtr, arrayPostfix,
+            rewriter.create<arith::ConstantIntOp>(loc, postFixLen + 1, 64),
+            rewriter.create<arith::ConstantIntOp>(loc, 0, 1)});
+    return labelBuffer;
+  }
 };
 
 struct ReturnToOutputLogPass
@@ -230,6 +354,19 @@ struct ReturnToOutputLogPass
       return;
     }
 
+    if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_internal_tostring"))) {
+      module.emitError("could not load string conversion function.");
+      signalPassFailure();
+      return;
+    }
+
+    if (failed(irBuilder.loadIntrinsic(module,
+                                       "__nvqpp_internal_number_of_digits"))) {
+      module.emitError("could not load number of digits function.");
+      signalPassFailure();
+      return;
+    }
+
     RewritePatternSet patterns(ctx);
     patterns.insert<ReturnRewrite>(ctx);
     LLVM_DEBUG(llvm::dbgs() << "Before return to output logging:\n" << module);
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 59665ef9292..3415810819d 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -2430,6 +2430,23 @@ def bodyBuilder(iterVal):
                     # kernel registry correctly for the next conditional check
                     if var.name in globalKernelRegistry:
                         node.func.id = var.name
+                # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered)
+                elif hasattr(var, '__call__'):
+                    # This is a callable object, which could be a C++ kernel
+                    # Get the full module + name key and see if it is registered
+                    modulePath = str(var.__module__) if hasattr(
+                        var, '__module__') else ''
+                    funcName = str(var.__name__) if hasattr(
+                        var, '__name__') else ''
+                    devKey = f"{modulePath}.{funcName}"
+                    if cudaq_runtime.isRegisteredDeviceModule(devKey):
+                        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
+                            self.module, devKey)
+                        if maybeKernelName != None:
+                            otherKernel = SymbolTable(
+                                self.module.operation)[maybeKernelName]
+                            processFunctionCall(otherKernel.type, len(node.args))
+                            return
 
             if node.func.id in globalKernelRegistry:
                 # If in `globalKernelRegistry`, it has to be in this Module
@@ -2554,23 +2571,6 @@ def bodyBuilder(iterVal):
                     cc.StoreOp(ctorArgs[i], eleAddr)
                 self.pushValue(stackSlot)
                 return
-            # Check generic callable objects that may be C++ kernels
-            elif hasattr(var, '__call__'):
-                # This is a callable object, which could be a C++ kernel
-                # Get the full module + name key and see if it is registered
-                modulePath = str(var.__module__) if hasattr(
-                    var, '__module__') else ''
-                funcName = str(var.__name__) if hasattr(
-                    var, '__name__') else ''
-                devKey = f"{modulePath}.{funcName}"
-                if cudaq_runtime.isRegisteredDeviceModule(devKey):
-                    maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
-                        self.module, devKey)
-                    if maybeKernelName != None:
-                        otherKernel = SymbolTable(
-                            self.module.operation)[maybeKernelName]
-                        processFunctionCall(otherKernel.type, len(node.args))
-                        return
             else:
                 self.emitFatalError(
                     "unhandled function call - {}, known kernels are {}".format(
diff --git a/python/tests/kernel/test_return_vectors.py b/python/tests/kernel/test_return_vectors.py
deleted file mode 100644
index 376aab05613..00000000000
--- a/python/tests/kernel/test_return_vectors.py
+++ /dev/null
@@ -1,132 +0,0 @@
-import cudaq
-import pytest
-import os
-
-
-def testReturnVectorBool():
-
-    @cudaq.kernel
-    def return_vec_bool() -> list[bool]:
-        ret = [True, False]
-        return ret
-
-    res = cudaq.run(return_vec_bool, shots_count=1)
-    assert res == [[True, False]]
-
-    @cudaq.kernel
-    def return_vec_bool_from_measure() -> list[bool]:
-        q = cudaq.qvector(5)
-        x(q)
-        ret = mz(q)
-        return ret
-
-    res = cudaq.run(return_vec_bool_from_measure, shots_count=10)
-    assert res == [[True] * 5] * 10
-
-    @cudaq.kernel
-    def return_vec_bool_from_measure_mix() -> list[bool]:
-        q = cudaq.qvector(5)
-        for i in range(5):
-            if i % 2 == 0:
-                x(q[i])
-        ret = mz(q)
-        return ret
-
-    res = cudaq.run(return_vec_bool_from_measure_mix, shots_count=10)
-    assert res == [[True, False, True, False, True]] * 10
-
-
-def testReturnVectorInt():
-
-    @cudaq.kernel
-    def return_vec_int() -> list[int]:
-        ret = [1, 2, 3]
-        return ret
-
-    res = cudaq.run(return_vec_int, shots_count=1)
-    assert res == [[1, 2, 3]]
-
-    @cudaq.kernel
-    def return_vec_int_from_measure() -> list[int]:
-        q = cudaq.qvector(5)
-        x(q)
-        ret = mz(q)
-        int_ret = [0 for b in ret]
-        i = 0
-        for b in ret:
-            if b:
-                int_ret[i] = 6
-            i += 1
-        return int_ret
-
-    res = cudaq.run(return_vec_int_from_measure, shots_count=10)
-    assert res == [[6] * 5] * 10
-
-    @cudaq.kernel
-    def return_vec_int_from_measure_mix() -> list[int]:
-        q = cudaq.qvector(5)
-        for i in range(5):
-            if i % 2 == 0:
-                x(q[i])
-        ret = mz(q)
-        int_ret = [0 for b in ret]
-        i = 0
-        for b in ret:
-            if b:
-                int_ret[i] = 6
-            i += 1
-        return int_ret
-
-    res = cudaq.run(return_vec_int_from_measure_mix, shots_count=10)
-    assert res == [[6, 0, 6, 0, 6]] * 10
-
-
-def testReturnVectorFloat():
-
-    @cudaq.kernel
-    def return_vec_float() -> list[float]:
-        ret = [1.1, 2.2, 3.3]
-        return ret
-
-    res = cudaq.run(return_vec_float, shots_count=1)
-    assert res == [[1.1, 2.2, 3.3]]
-
-    @cudaq.kernel
-    def return_vec_float_from_measure() -> list[float]:
-        q = cudaq.qvector(5)
-        x(q)
-        ret = mz(q)
-        float_ret = [0.0 for b in ret]
-        i = 0
-        for b in ret:
-            if b:
-                float_ret[i] = 6.6
-            i += 1
-        return float_ret
-
-    res = cudaq.run(return_vec_float_from_measure, shots_count=10)
-    assert res == [[6.6] * 5] * 10
-
-    @cudaq.kernel
-    def return_vec_float_from_measure_mix() -> list[float]:
-        q = cudaq.qvector(5)
-        for i in range(5):
-            if i % 2 == 0:
-                x(q[i])
-        ret = mz(q)
-        float_ret = [0.0 for b in ret]
-        i = 0
-        for b in ret:
-            if b:
-                float_ret[i] = 6.6
-            i += 1
-        return float_ret
-
-    res = cudaq.run(return_vec_float_from_measure_mix, shots_count=10)
-    assert res == [[6.6, 0.0, 6.6, 0.0, 6.6]] * 10
-
-
-# leave for gdb debugging
-if __name__ == "__main__":
-    loc = os.path.abspath(__file__)
-    pytest.main([loc, "-s"])
diff --git a/python/tests/kernel/test_run_async_kernel.py b/python/tests/kernel/test_run_async_kernel.py
index fc1c0ac3aae..31796c18e15 100644
--- a/python/tests/kernel/test_run_async_kernel.py
+++ b/python/tests/kernel/test_run_async_kernel.py
@@ -14,8 +14,6 @@
 import numpy as np
 import pytest
 
-list_err_msg = 'does not yet support returning `list` from entry-point kernels'
-
 
 def is_close(actual, expected):
     return np.isclose(actual, expected, atol=1e-6)
@@ -338,38 +336,41 @@ def test_return_list_bool():
     def simple_list_bool_no_args() -> list[bool]:
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_bool_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool(n: int) -> list[bool]:
         qubits = cudaq.qvector(n)
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool, 2, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_bool, 2, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool_args, 2, [True, False, True]).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_bool_args, 2, [True, False, True], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(2)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool_args_no_broadcast,
-                        [True, False, True]).get()
-    assert list_err_msg in str(e.value)
-
+    results = cudaq.run_async(simple_list_bool_args_no_broadcast,
+                        [True, False, True], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
 def test_return_list_int():
 
@@ -377,18 +378,21 @@ def test_return_list_int():
     def simple_list_int_no_args() -> list[int]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int(n: int, t: list[int]) -> list[int]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+
+    results = cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int8():
@@ -397,18 +401,22 @@ def test_return_list_int8():
     def simple_list_int8_no_args() -> list[np.int8]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int8_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    
+    results = cudaq.run_async(simple_list_int8_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    
+    results = cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int16():
@@ -417,18 +425,20 @@ def test_return_list_int16():
     def simple_list_int16_no_args() -> list[np.int16]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int16_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int16_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int32():
@@ -437,18 +447,20 @@ def test_return_list_int32():
     def simple_list_int32_no_args() -> list[np.int32]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int32_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int32_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int64():
@@ -457,18 +469,20 @@ def test_return_list_int64():
     def simple_list_int64_no_args() -> list[np.int64]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int64_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int64_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_float():
@@ -477,20 +491,22 @@ def test_return_list_float():
     def simple_list_float_no_args() -> list[float]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_float_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float(n: int, t: list[float]) -> list[float]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float,
+    results = cudaq.run_async(simple_list_float,
                         2, [-13.2, 5.0, 42.99],
                         shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float32():
@@ -499,20 +515,22 @@ def test_return_list_float32():
     def simple_list_float32_no_args() -> list[np.float32]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float32_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_float32_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float32,
+    results = cudaq.run_async(simple_list_float32,
                         2, [-13.2, 5.0, 42.99],
                         shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float64():
@@ -521,21 +539,22 @@ def test_return_list_float64():
     def simple_list_float64_no_args() -> list[np.float64]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float64_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_float64_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float64,
+    results = cudaq.run_async(simple_list_float64,
                         2, [-13.2, 5.0, 42.99],
                         shots_count=2).get()
-    assert list_err_msg in str(e.value)
-
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
 # Test tuples
 # TODO: Define spec for using tuples in kernels
diff --git a/python/tests/kernel/test_run_kernel.py b/python/tests/kernel/test_run_kernel.py
index 3e656ee16a9..d73b35fa352 100644
--- a/python/tests/kernel/test_run_kernel.py
+++ b/python/tests/kernel/test_run_kernel.py
@@ -14,8 +14,6 @@
 import warnings
 import pytest
 
-list_err_msg = 'does not yet support returning `list` from entry-point kernels'
-
 skipIfBraketNotInstalled = pytest.mark.skipif(
     not (cudaq.has_target("braket")),
     reason='Could not find `braket` in installation')
@@ -333,36 +331,41 @@ def test_return_list_bool():
     def simple_list_bool_no_args() -> list[bool]:
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_bool_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool(n: int) -> list[bool]:
         qubits = cudaq.qvector(n)
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool, 2, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_bool, 2, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool_args, 2, [True, False, True])
-    assert list_err_msg in str(e.value)
+
+    results = cudaq.run(simple_list_bool_args, 2, [True, False, True], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(2)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True])
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
 
 def test_return_list_int():
@@ -371,18 +374,20 @@ def test_return_list_int():
     def simple_list_int_no_args() -> list[int]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int(n: int, t: list[int]) -> list[int]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int8():
@@ -391,18 +396,21 @@ def test_return_list_int8():
     def simple_list_int8_no_args() -> list[np.int8]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int8_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int8_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
+
 
     @cudaq.kernel
     def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int16():
@@ -411,18 +419,21 @@ def test_return_list_int16():
     def simple_list_int16_no_args() -> list[np.int16]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int16_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+
+    results = cudaq.run(simple_list_int16_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int32():
@@ -431,18 +442,20 @@ def test_return_list_int32():
     def simple_list_int32_no_args() -> list[np.int32]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int32_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int32_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int64():
@@ -451,18 +464,20 @@ def test_return_list_int64():
     def simple_list_int64_no_args() -> list[np.int64]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int64_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int64_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_float():
@@ -471,18 +486,20 @@ def test_return_list_float():
     def simple_list_float_no_args() -> list[float]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float_no_args, shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float(n: int, t: list[float]) -> list[float]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float32():
@@ -491,18 +508,20 @@ def test_return_list_float32():
     def simple_list_float32_no_args() -> list[np.float32]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float32_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float32_no_args, shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float64():
@@ -511,19 +530,84 @@ def test_return_list_float64():
     def simple_list_float64_no_args() -> list[np.float64]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float64_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float64_no_args, shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2)
-    assert list_err_msg in str(e.value)
+    
+    results = cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
+def test_return_list_large_size():
+    # Returns a large list (dynamic size) to stress test the code generation
+    
+    @cudaq.kernel
+    def kernel_with_dynamic_int_array_input(n: int, t: list[int]) -> list[int]:
+        qubits = cudaq.qvector(n)
+        return t
+
+    @cudaq.kernel
+    def kernel_with_dynamic_float_array_input(n: int, t: list[float]) -> list[float]:
+        qubits = cudaq.qvector(n)
+        return t
+    
+    @cudaq.kernel
+    def kernel_with_dynamic_bool_array_input(n: int, t: list[bool]) -> list[bool]:
+        qubits = cudaq.qvector(n)
+        return t
+
+    # Test with various sizes (validate dynamic output logging)
+    for array_size in [10, 15, 100, 167, 1000]:
+        input_array = list(np.random.randint(-1000, 1000, size=array_size))
+        results = cudaq.run(kernel_with_dynamic_int_array_input, 2, input_array, shots_count=2)
+        assert len(results) == 2
+        assert results[0] == input_array
+        assert results[1] == input_array
+
+        input_array_float = list(np.random.uniform(-1000.0, 1000.0, size=array_size))
+        results = cudaq.run(kernel_with_dynamic_float_array_input, 2, input_array_float, shots_count=2)
+        assert len(results) == 2
+        assert is_close_array(results[0], input_array_float)
+        assert is_close_array(results[1], input_array_float)
+
+        input_array_bool = []
+        for _ in range(array_size):
+            input_array_bool.append(True if np.random.rand() > 0.5 else False)
+        results = cudaq.run(kernel_with_dynamic_bool_array_input, 2, input_array_bool, shots_count=2)
+        assert len(results) == 2
+        assert results[0] == input_array_bool
+        assert results[1] == input_array_bool
+
+def test_return_dynamics_measure_results():
+    @cudaq.kernel
+    def measure_all_qubits(numQubits: int) -> list[bool]:
+        # Number of qubits is dynamic
+        qubits = cudaq.qvector(numQubits)
+        for i in range(numQubits):
+            if i % 2 == 0:
+                x(qubits[i])
+
+        return mz(qubits)
+
+    for numQubits in [1, 3, 5, 11, 20]:
+        shots = 2
+        results = cudaq.run(measure_all_qubits, numQubits, shots_count=shots)
+        assert len(results) == shots
+        for res in results:
+            assert len(res) == numQubits
+            for i in range(numQubits):
+                if i % 2 == 0:
+                    assert res[i] == True
+                else:
+                    assert res[i] == False
 
 # Test tuples
 # TODO: Define spec for using tuples in kernels

From 03e5836d525b9eea65e57597ac2971f1a80b3d90 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 22 Oct 2025 06:06:43 +0000
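Subject: [PATCH 6/8] Fix tests

Return from the ReturnToOutputLog rewrite once the per-element output
logs have been generated, and update the ast_list_comprehension CHECK
lines to match the i8-backed buffers now emitted for lists of bool.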

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 lib/Optimizer/CodeGen/ReturnToOutputLog.cpp |  1 +
 python/tests/mlir/ast_list_comprehension.py | 44 ++++++++++++---------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
index 09f280ff255..01665222154 100644
--- a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
+++ b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
@@ -162,6 +162,7 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
                 Value w = rewriter.create<cudaq::cc::LoadOp>(loc, v);
                 genOutputLog(loc, rewriter, w, offset);
               }
+              return;
             }
 
           // If we reach here and we cannot determine the constant size of the
diff --git a/python/tests/mlir/ast_list_comprehension.py b/python/tests/mlir/ast_list_comprehension.py
index ba3e936db4c..e0051bf28e7 100644
--- a/python/tests/mlir/ast_list_comprehension.py
+++ b/python/tests/mlir/ast_list_comprehension.py
@@ -55,10 +55,12 @@ def kernel3() -> float:
 
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"}
-# CHECK:            %[[VAL_0:.*]] = arith.constant true
-# CHECK:            %[[VAL_1:.*]] = cc.alloca !cc.array<i1 x 5>
-# CHECK:            %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr<!cc.array<i1 x 5>>, i64) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_0:.*]] = arith.constant 1 : i8
+# CHECK:            %[[VAL_1:.*]] = cc.alloca !cc.array<i8 x 5>
+# CHECK:            %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x 5>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr<!cc.array<i8 x 5>>, i64) -> !cc.ptr<i8>
+# CHECK:            cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<i8>
+# CHECK:            %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_2]], %c5_i64 : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.stdvec<i1>
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 # CHECK:            %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
@@ -122,10 +124,12 @@ def kernel3() -> float:
 # CHECK:            %[[VAL_0:.*]] = arith.constant true
 # CHECK:            %[[VAL_1:.*]] = cc.alloca i1
 # CHECK:            cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i1 x 5>
-# CHECK:            %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i1 x 5>>, i64) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i8 x 5>
+# CHECK:            %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<i8 x 5>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i8 x 5>>, i64) -> !cc.ptr<i8>
+# CHECK:            %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8
+# CHECK:            cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr<i8>
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel2() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 # CHECK:            %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
@@ -198,10 +202,12 @@ def kernel3() -> float:
 # CHECK:            %[[VAL_0:.*]] = arith.constant true
 # CHECK:            %[[VAL_1:.*]] = cc.alloca i1
 # CHECK:            cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i1 x 5>
-# CHECK:            %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i1 x 5>>, i64) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i8 x 5>
+# CHECK:            %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<i8 x 5>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i8 x 5>>, i64) -> !cc.ptr<i8>
+# CHECK:            %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8
+# CHECK:            cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr<i8>
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 # CHECK:            %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
@@ -271,14 +277,14 @@ def kernel3() -> float:
 
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"}
-# CHECK:            %[[VAL_0:.*]] = arith.constant 1 : i64
-# CHECK:            %[[VAL_1:.*]] = arith.constant true
+# CHECK:            %[[VAL_0:.*]] = arith.constant 1 : i8
+# CHECK:            %[[VAL_1:.*]] = arith.constant 1 : i64
 # CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<!cc.stdvec<i1> x 5>
-# CHECK:            %[[VAL_3:.*]] = cc.alloca !cc.array<i1 x 1>
-# CHECK:            %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i1 x 1>>) -> !cc.ptr<!cc.array<i1 x ?>>
-# CHECK:            %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i1 x 1>>) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_1]], %[[VAL_5]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_0]] : (!cc.ptr<!cc.array<i1 x ?>>, i64) -> !cc.stdvec<i1>
+# CHECK:            %[[VAL_3:.*]] = cc.alloca !cc.array<i8 x 1>
+# CHECK:            %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i8 x 1>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i8 x 1>>) -> !cc.ptr<i8>
+# CHECK:            cc.store %[[VAL_0]], %[[VAL_5]] : !cc.ptr<i8>
+# CHECK:            %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.stdvec<i1>
 # CHECK:            %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<!cc.stdvec<i1> x 5>>, i64) -> !cc.ptr<!cc.stdvec<i1>>
 # CHECK:            cc.store %[[VAL_6]], %[[VAL_7]] : !cc.ptr<!cc.stdvec<i1>>
 

From da0bab6157271c8841972406c5c91382ee72d615 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 22 Oct 2025 06:42:12 +0000
Subject: [PATCH 7/8] Split interop support into a separate branch

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 python/cudaq/kernel/ast_bridge.py             | 35 ---------
 python/cudaq/kernel/kernel_decorator.py       | 67 ++++-------------
 python/cudaq/kernel/utils.py                  | 27 +------
 python/runtime/cudaq/algorithms/py_run.cpp    | 31 +++-----
 .../cudaq/platform/py_alt_launch_kernel.cpp   | 20 ++----
 .../tests/interop/quantum_lib/quantum_lib.cpp |  8 ---
 .../tests/interop/quantum_lib/quantum_lib.h   | 10 ---
 .../test_cpp_quantum_algorithm_module.cpp     | 71 -------------------
 python/tests/interop/test_interop.py          | 58 ---------------
 9 files changed, 32 insertions(+), 295 deletions(-)

diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 3415810819d..96f8ebc4f70 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -435,24 +435,6 @@ def changeOperandToType(self, ty, operand, allowDemotion=False):
                                  sint=operand_width != 1,
                                  zint=operand_width == 1).result
         
-        if quake.StruqType.isinstance(ty):
-            if quake.StruqType.isinstance(operand.type):
-                # Check that the struct types are the same, only the name may differ.
-                targetMemberType = quake.StruqType.getTypes(ty)
-                operandMemberType = quake.StruqType.getTypes(operand.type)
-                if len(targetMemberType) != len(operandMemberType):
-                    self.emitFatalError(
-                        f'cannot convert value of type {operand.type} to the requested type {ty}',
-                        self.currentNode)
-                for i in range(len(targetMemberType)):
-                    if targetMemberType[i] != operandMemberType[i]:
-                        self.emitFatalError(
-                            f'cannot convert value of type {operand.type} to the requested type {ty}',
-                            self.currentNode)
-                # It is the same struct, do a cast
-                structPtr = self.ifNotPointerThenStore(operand)
-                castedPtr =  cc.CastOp(cc.PointerType.get(ty), structPtr).result
-                return self.ifPointerThenLoad(castedPtr)
         
         self.emitFatalError(
             f'cannot convert value of type {operand.type} to the requested type {ty}',
@@ -2430,23 +2412,6 @@ def bodyBuilder(iterVal):
                     # kernel registry correctly for the next conditional check
                     if var.name in globalKernelRegistry:
                         node.func.id = var.name
-                # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered)
-                elif hasattr(var, '__call__'):
-                    # This is a callable object, which could be a C++ kernel
-                    # Get the full module + name key and see if it is registered
-                    modulePath = str(var.__module__) if hasattr(
-                        var, '__module__') else ''
-                    funcName = str(var.__name__) if hasattr(
-                        var, '__name__') else ''
-                    devKey = f"{modulePath}.{funcName}"
-                    if cudaq_runtime.isRegisteredDeviceModule(devKey):
-                        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
-                            self.module, devKey)
-                        if maybeKernelName != None:
-                            otherKernel = SymbolTable(
-                                self.module.operation)[maybeKernelName]
-                            processFunctionCall(otherKernel.type, len(node.args))
-                            return
 
             if node.func.id in globalKernelRegistry:
                 # If in `globalKernelRegistry`, it has to be in this Module
diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py
index f1a685468a3..799117a07bc 100644
--- a/python/cudaq/kernel/kernel_decorator.py
+++ b/python/cudaq/kernel/kernel_decorator.py
@@ -451,24 +451,6 @@ def __convertStringsToPauli__(self, arg):
 
         return arg
 
-    def getCallableNames(self, *args):
-        callableNames = []
-        for arg in args:
-            if isinstance(arg, PyKernelDecorator):
-                callableNames.append(arg.name)
-            else:
-                if hasattr(arg, '__call__') and hasattr(arg, '__module__') and hasattr(arg, '__name__'):
-                    # This is a callable object, likely a C++ kernel
-                    devKey = f"{arg.__module__}.{arg.__name__}"
-                    if cudaq_runtime.isRegisteredDeviceModule(devKey):                        
-                        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
-                            self.module, devKey)
-                        if maybeKernelName != None:
-                            # Remove "__nvqpp__mlirgen__" prefix
-                            maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "")
-                            callableNames.append(maybeKernelName)
-        return callableNames
-    
     def __call__(self, *args):
         """
         Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR 
@@ -499,8 +481,7 @@ def __call__(self, *args):
             mlirType = mlirTypeFromPyType(type(arg),
                                           self.module.context,
                                           argInstance=arg,
-                                          argTypeToCompareTo=self.argTypes[i],
-                                          module=self.module)
+                                          argTypeToCompareTo=self.argTypes[i])
 
             if self.isCastablePyType(mlirType, self.argTypes[i]):
                 processedArgs.append(
@@ -515,39 +496,19 @@ def __call__(self, *args):
                 )
 
             if cc.CallableType.isinstance(mlirType):
-                if isinstance(arg, PyKernelDecorator):
-                    # Assume this is a PyKernelDecorator
-                    callableNames.append(arg.name)
-                    # It may be that the provided input callable kernel
-                    # is not currently in the ModuleOp. Need to add it
-                    # if that is the case, we have to use the AST
-                    # so that it shares self.module's MLIR Context
-                    symbols = SymbolTable(self.module.operation)
-                    if nvqppPrefix + arg.name not in symbols:
-                        tmpBridge = PyASTBridge(self.capturedDataStorage,
-                                                existingModule=self.module,
-                                                disableEntryPointTag=True)
-                        tmpBridge.visit(globalAstRegistry[arg.name][0])
-                else:
-                    if hasattr(arg, '__call__') and hasattr(arg, '__module__') and hasattr(arg, '__name__'):
-                        # This is a callable object, likely a C++ kernel
-                        devKey = f"{arg.__module__}.{arg.__name__}"
-                        if cudaq_runtime.isRegisteredDeviceModule(devKey):
-                            print("111Found registered device module for callable object:", devKey)
-                            
-                            maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
-                                self.module, devKey)
-                            if maybeKernelName != None:
-                                otherKernel = SymbolTable(
-                                    self.module.operation)[maybeKernelName]
-                                print("Found Other kernel:", otherKernel)
-                                # Remove "__nvqpp__mlirgen__" prefix
-                                maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "")
-                                callableNames.append(maybeKernelName)
-                    else:
-                        emitFatalError(
-                            "Invalid callable argument provided to kernel."
-                        )
+                # Assume this is a PyKernelDecorator
+                callableNames.append(arg.name)
+                # It may be that the provided input callable kernel
+                # is not currently in the ModuleOp. Need to add it
+                # if that is the case, we have to use the AST
+                # so that it shares self.module's MLIR Context
+                symbols = SymbolTable(self.module.operation)
+                if nvqppPrefix + arg.name not in symbols:
+                    tmpBridge = PyASTBridge(self.capturedDataStorage,
+                                            existingModule=self.module,
+                                            disableEntryPointTag=True)
+                    tmpBridge.visit(globalAstRegistry[arg.name][0])
+
             # Convert `numpy` arrays to lists
             if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"):
                 if arg.ndim != 1:
diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index 8c0b62327e3..efaf213b581 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -15,8 +15,8 @@
 import types
 
 from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime
-from cudaq.mlir.dialects import quake, cc, func
-from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable
+from cudaq.mlir.dialects import quake, cc
+from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType
 
 State = cudaq_runtime.State
 qvector = cudaq_runtime.qvector
@@ -442,28 +442,7 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
     if 'argInstance' in kwargs:
         argInstance = kwargs['argInstance']
         if isinstance(argInstance, Callable):
-            if hasattr(argInstance, 'argTypes'):
-                return cc.CallableType.get(argInstance.argTypes, ctx)
-            elif hasattr(argInstance, '__call__') and hasattr(argInstance, '__module__') and hasattr(argInstance, '__name__'):
-                # This is a callable object, likely a C++ kernel
-                devKey = f"{argInstance.__module__}.{argInstance.__name__}"
-                if cudaq_runtime.isRegisteredDeviceModule(devKey):
-                    if "module" in kwargs:
-                        module = kwargs['module']
-                        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(
-                            module, devKey)
-                        if maybeKernelName != None:
-                            otherKernel = SymbolTable(
-                                module.operation)[maybeKernelName]
-                            if isinstance(otherKernel, func.FuncOp):
-                                argTypes = []
-                                for arg in otherKernel.arguments:
-                                    argTypes.append(arg.type)
-                                return cc.CallableType.get(argTypes, ctx)
-                            else:
-                                emitFatalError(
-                                    f"Registered C++ kernel '{maybeKernelName}' is not of CallableType."
-                                )
+            return cc.CallableType.get(argInstance.argTypes, ctx)
 
     for name in globalRegisteredTypes.classes:
         customTy, memberTys = globalRegisteredTypes.getClassAttributes(name)
diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp
index ef59b14f461..665836234df 100644
--- a/python/runtime/cudaq/algorithms/py_run.cpp
+++ b/python/runtime/cudaq/algorithms/py_run.cpp
@@ -39,8 +39,7 @@ static std::vector<py::object> readRunResults(mlir::ModuleOp module,
 }
 
 static std::tuple<std::string, MlirModule, OpaqueArguments *,
-                  mlir::func::FuncOp, std::string, mlir::func::FuncOp,
-                  std::vector<std::string>>
+                  mlir::func::FuncOp, std::string, mlir::func::FuncOp>
 getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (!py::hasattr(kernel, "arguments"))
     throw std::runtime_error(
@@ -53,11 +52,6 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (py::hasattr(kernel, "compile"))
     kernel.attr("compile")();
 
-  std::vector<std::string> callableNames;
-  if (py::hasattr(kernel, "getCallableNames"))
-    callableNames =
-        kernel.attr("getCallableNames")(*args).cast<std::vector<std::string>>();
-
   auto origKernName = kernel.attr("name").cast<std::string>();
   auto kernelName = origKernName + ".run";
   if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none())
@@ -84,8 +78,7 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   }
   auto *argData = toOpaqueArgs(args, kernelMod, kernelName);
   auto funcOp = getKernelFuncOp(kernelMod, kernelName);
-  return {kernelName,   kernelMod, argData,      funcOp,
-          origKernName, origKern,  callableNames};
+  return {kernelName, kernelMod, argData, funcOp, origKernName, origKern};
 }
 
 static details::RunResultSpan
@@ -93,7 +86,6 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
                MlirModule module, mlir::func::FuncOp funcOp,
                mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs,
                quantum_platform &platform, std::size_t shots_count,
-               const std::vector<std::string> &callableNames,
                std::size_t qpu_id = 0) {
   auto returnTypes = origKernel.getResultTypes();
   if (returnTypes.empty() || returnTypes.size() > 1)
@@ -112,13 +104,13 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
 
   auto mod = unwrap(module);
 
-  auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase(
-      name, module, returnTy, runtimeArgs, callableNames, 0, false);
+  auto [rawArgs, size, returnOffset, thunk] =
+      pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false);
 
   auto results = details::runTheKernel(
       [&]() mutable {
         pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size,
-                       returnOffset, callableNames);
+                       returnOffset, {});
       },
       platform, name, origName, shots_count, qpu_id);
 
@@ -144,7 +136,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   if (shots_count == 0)
     return {};
 
-  auto [name, module, argData, func, origName, origKern, callableNames] =
+  auto [name, module, argData, func, origName, origKern] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -160,7 +152,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   }
 
   auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData,
-                             platform, shots_count, callableNames);
+                             platform, shots_count);
   delete argData;
   auto results = pyReadResults(span, module, func, origKern, shots_count);
 
@@ -195,7 +187,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
                              ") exceeds the number of available QPUs (" +
                              std::to_string(numQPUs) + ")");
 
-  auto [name, module, argData, func, origName, origKern, callableNames] =
+  auto [name, module, argData, func, origName, origKern] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -230,7 +222,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
     QuantumTask wrapped = detail::make_copyable_function(
         [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count,
          qpu_id, argData, name, module, func, origKern, origName,
-         noise_model = std::move(noise_model), callableNames]() mutable {
+         noise_model = std::move(noise_model)]() mutable {
           auto &platform = get_platform();
 
           // Launch the kernel in the appropriate context.
@@ -238,9 +230,8 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
             platform.set_noise(&noise_model.value());
 
           try {
-            auto span =
-                pyRunTheKernel(name, origName, module, func, origKern, *argData,
-                               platform, shots_count, callableNames, qpu_id);
+            auto span = pyRunTheKernel(name, origName, module, func, origKern,
+                                       *argData, platform, shots_count, qpu_id);
             delete argData;
             sp.set_value(span);
             ep.set_value("");
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 5543a54b37d..9db3e9e431f 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -117,21 +117,8 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod,
   auto *argData = new cudaq::OpaqueArguments();
   args = simplifiedValidateInputArguments(args);
   setDataLayout(mod);
-  auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
-                               py::object &arg) {
-    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
-      // Just give it some dummy data that will not be used.
-      // We synthesize away all callables, the block argument
-      // remains but it is not used, so just give argsCreator
-      // something, and we'll make sure its cleaned up.
-      long *ourAllocatedArg = new long();
-      argData.emplace_back(ourAllocatedArg,
-                           [](void *ptr) { delete static_cast<long *>(ptr); });
-      return true;
-    }
-    return false;
-  };
-  cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler);
+  cudaq::packArgs(*argData, args, kernelFunc,
+                  [](OpaqueArguments &, py::object &) { return false; });
   return argData;
 }
 
@@ -170,6 +157,7 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module,
     pm.addPass(cudaq::opt::createGenerateKernelExecution(
         {.startingArgIdx = startingArgIdx}));
     pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true}));
+    pm.addPass(cudaq::opt::createReturnToOutputLog());
     pm.addPass(cudaq::opt::createLambdaLiftingPass());
     pm.addPass(cudaq::opt::createDistributedDeviceCall());
     std::string tl = getTransportLayer();
@@ -959,7 +947,7 @@ void bindAltLaunchKernel(py::module &mod,
 
   auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
                                py::object &arg) {
-    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
+    if (py::hasattr(arg, "module")) {
       // Just give it some dummy data that will not be used.
       // We synthesize away all callables, the block argument
       // remains but it is not used, so just give argsCreator
diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp
index ed313b32f88..7c5cbb23054 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.cpp
+++ b/python/tests/interop/quantum_lib/quantum_lib.cpp
@@ -29,12 +29,4 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector<double> &x,
 __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); }
 
 __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); }
-
-__qpu__ void reset_group(patch p) {
-  for (std::size_t i = 0; i < p.data.size(); i++)
-    reset(p.data[i]);
-}
-
-__qpu__ void x_group(patch p) { x(p.data); }
-
 } // namespace cudaq
diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h
index 4b9fa371351..a0655099237 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.h
+++ b/python/tests/interop/quantum_lib/quantum_lib.h
@@ -9,12 +9,6 @@
 
 #include "cudaq/qis/qubit_qis.h"
 
-// Custom data structure
-struct patch {
-  cudaq::qview<> data;
-  cudaq::qview<> aux;
-};
-
 namespace cudaq {
 void entryPoint(const std::function<void(cudaq::qvector<> &)> &statePrep);
 
@@ -25,8 +19,4 @@ void another(cudaq::qview<> qubits, std::size_t);
 
 void uccsd(cudaq::qview<> qubits, std::size_t);
 
-void reset_group(patch p);
-
-void x_group(patch p);
-
 } // namespace cudaq
diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
index 9d0b54bfa57..4ea2d2176cc 100644
--- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
+++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
@@ -8,7 +8,6 @@
 
 #include "cudaq.h"
 #include "cudaq/algorithms/sample.h"
-#include "cudaq/qis/qkernel.h"
 #include "quantum_lib/quantum_lib.h"
 #include "runtime/interop/PythonCppInterop.h"
 #include <pybind11/pybind11.h>
@@ -16,22 +15,6 @@
 
 namespace py = pybind11;
 
-namespace {
-static std::unordered_map<std::string,
-                          cudaq::qkernel<void(cudaq::qview<>, std::size_t)>>
-    g_cppKernels_1;
-
-static std::unordered_map<std::string, cudaq::qkernel<void(patch)>>
-    g_cppKernels_2;
-
-static const bool initKernels = []() {
-  g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd));
-  g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group));
-  g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group));
-  return true;
-}();
-} // namespace
-
 PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   m.def("test_cpp_qalgo", [](py::object statePrepIn) {
@@ -66,58 +49,4 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   cudaq::python::addDeviceKernelInterop<cudaq::qview<>, std::size_t>(
       m, "qstd", "uccsd", "");
-
-  // Convert the C++ kernel registry to Python-accessible kernels
-  auto interopSubMod = m.def_submodule("_cpp_interop_kernels");
-  static std::unordered_map<std::string, py::object> g_py_kernels;
-
-  for (auto &[name, kernel] : g_cppKernels_1) {
-    const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull(
-        cudaq::registry::__cudaq_getLinkableKernelKey(&kernel));
-    if (!qkernelName) {
-      throw std::runtime_error("Could not find registered kernel name for " +
-                               name);
-    }
-
-    std::string kernelName = qkernelName;
-    if (kernelName.starts_with("function_"))
-      kernelName = kernelName.substr(std::string("function_").length());
-
-    interopSubMod.def(
-        kernelName.c_str(), [](py::object qview, std::size_t i) {},
-        "Auto-generated one-qubit encoding kernel from C++ code");
-    cudaq::python::registerDeviceKernel(
-        interopSubMod.attr("__name__").cast<std::string>(), kernelName, "");
-    g_py_kernels.insert(
-        std::make_pair(name, interopSubMod.attr(kernelName.c_str())));
-  }
-
-  for (auto &[name, kernel] : g_cppKernels_2) {
-    const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull(
-        cudaq::registry::__cudaq_getLinkableKernelKey(&kernel));
-    if (!qkernelName) {
-      throw std::runtime_error("Could not find registered kernel name for " +
-                               name);
-    }
-
-    std::string kernelName = qkernelName;
-    if (kernelName.starts_with("function_"))
-      kernelName = kernelName.substr(std::string("function_").length());
-
-    interopSubMod.def(
-        kernelName.c_str(), [](py::object patch) {},
-        "Auto-generated one-qubit encoding kernel from C++ code");
-    cudaq::python::registerDeviceKernel(
-        interopSubMod.attr("__name__").cast<std::string>(), kernelName, "");
-    g_py_kernels.insert(
-        std::make_pair(name, interopSubMod.attr(kernelName.c_str())));
-  }
-
-  m.def("get_cpp_kernel", [](const std::string &name) {
-    auto it = g_py_kernels.find(name);
-    if (it == g_py_kernels.end())
-      throw std::runtime_error("No C++ kernel registered for requested name.");
-
-    return it->second;
-  });
 }
diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py
index 7e1e7e05fe9..e63588408ac 100644
--- a/python/tests/interop/test_interop.py
+++ b/python/tests/interop/test_interop.py
@@ -7,8 +7,6 @@
 # ============================================================================ #
 
 import cudaq, pytest
-from typing import Callable
-from dataclasses import dataclass
 
 cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo')
 
@@ -244,59 +242,3 @@ def entry():
         takesCapture(spin)
 
     entry.compile()
-
-def test_cpp_qkernel():
-  # Test the `qkernel` provided in C++ via a map-like registry.
-  # This is provided as a function-like callable.
-  kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd")
-  
-  # Use as a capture
-  @cudaq.kernel
-  def cpp_qkernel():
-    q = cudaq.qvector(4)
-    kernel_from_cpp_registry(q, 0)
-
-  cpp_qkernel()
-
-
-  # Use as a callable argument
-  @cudaq.kernel
-  def caller(k: Callable[[cudaq.qview, int], None]):
-    q = cudaq.qvector(4)
-    k(q, 0)
-
-  caller(kernel_from_cpp_registry)
-
-
-def test_cpp_custom_struct(): 
-  # Define a struct in Python that matches the C++ struct
-  @dataclass(slots=True)
-  class patch:
-    data: cudaq.qvector
-    aux: cudaq.qvector
-  
-  reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset")
-  x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x")
-
-  # Use as a capture
-  @cudaq.kernel
-  def cpp_qkernel_struct():
-    q = cudaq.qvector(4)
-    r = cudaq.qvector(2)
-    x(q)
-    reset_qkernel(patch(q, r))
-
-  counts = cudaq.sample(cpp_qkernel_struct)
-  counts.dump()
-  assert len(counts) == 1 and '000000' in counts
-
-  @cudaq.kernel
-  def cpp_qkernel_struct_x():
-    q = cudaq.qvector(4)
-    r = cudaq.qvector(2)
-    x_qkernel(patch(q, r))
-
-  counts = cudaq.sample(cpp_qkernel_struct_x)
-  counts.dump()
-  assert len(counts) == 1 and '111100' in counts
-

From 2ba98ca1d485d3bad9d4cb052e3079ab00698d67 Mon Sep 17 00:00:00 2001
From: Thien Nguyen <thiennguyen@nvidia.com>
Date: Wed, 22 Oct 2025 06:46:53 +0000
Subject: [PATCH 8/8] Support for qkernel interop

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>

Tidy up

Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
---
 python/cudaq/kernel/ast_bridge.py             | 16 +++-
 python/cudaq/kernel/kernel_decorator.py       | 56 +++++++++----
 python/cudaq/kernel/utils.py                  | 48 ++++++++++--
 python/runtime/cudaq/algorithms/py_run.cpp    | 31 +++++---
 .../cudaq/platform/py_alt_launch_kernel.cpp   | 20 ++++-
 python/runtime/interop/PythonCppInterop.h     | 26 +++++++
 .../tests/interop/quantum_lib/quantum_lib.cpp |  8 ++
 .../tests/interop/quantum_lib/quantum_lib.h   | 10 +++
 .../test_cpp_quantum_algorithm_module.cpp     | 39 ++++++++++
 python/tests/interop/test_interop.py          | 78 +++++++++++++++++++
 10 files changed, 293 insertions(+), 39 deletions(-)

diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 96f8ebc4f70..9069b4bd168 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -31,7 +31,7 @@
 from .utils import (Color, globalAstRegistry, globalKernelRegistry,
                     globalRegisteredOperations, globalRegisteredTypes,
                     nvqppPrefix, mlirTypeFromAnnotation, mlirTypeFromPyType,
-                    mlirTypeToPyType, mlirTryCreateStructType)
+                    mlirTypeToPyType, mlirTryCreateStructType, getInteropKernelNameIfFound)
 
 State = cudaq_runtime.State
 
@@ -435,7 +435,6 @@ def changeOperandToType(self, ty, operand, allowDemotion=False):
                                  sint=operand_width != 1,
                                  zint=operand_width == 1).result
         
-        
         self.emitFatalError(
             f'cannot convert value of type {operand.type} to the requested type {ty}',
             self.currentNode)
@@ -2412,6 +2411,15 @@ def bodyBuilder(iterVal):
                     # kernel registry correctly for the next conditional check
                     if var.name in globalKernelRegistry:
                         node.func.id = var.name
+                # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered)
+                elif hasattr(var, '__call__'):
+                    # Check if this is a registered C++ kernel 
+                    maybeKernelName = getInteropKernelNameIfFound(var, self.module)
+                    if maybeKernelName != None:
+                        otherKernel = SymbolTable(
+                            self.module.operation)[maybeKernelName]
+                        processFunctionCall(otherKernel.type, len(node.args))
+                        return
 
             if node.func.id in globalKernelRegistry:
                 # If in `globalKernelRegistry`, it has to be in this Module
@@ -2493,8 +2501,10 @@ def bodyBuilder(iterVal):
                     for _, v in annotations.items()
                 ]
 
+                unnamed_struct = "__repr__" not in cls.__dict__
+                struct_name = node.func.id if not unnamed_struct else ""
                 structTy = mlirTryCreateStructType(structTys,
-                                                   name=node.func.id,
+                                                   name=struct_name,
                                                    context=self.ctx)
                 if structTy is None:
                     self.emitFatalError(
diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py
index 799117a07bc..ad571d89393 100644
--- a/python/cudaq/kernel/kernel_decorator.py
+++ b/python/cudaq/kernel/kernel_decorator.py
@@ -21,7 +21,7 @@
 from .captured_data import CapturedDataStorage
 from .utils import (emitFatalError, emitErrorIfInvalidPauli, globalAstRegistry,
                     globalRegisteredTypes, mlirTypeFromPyType, mlirTypeToPyType,
-                    nvqppPrefix)
+                    nvqppPrefix, getInteropKernelNameIfFound)
 
 # This file implements the decorator mechanism needed to
 # JIT compile CUDA-Q kernels. It exposes the cudaq.kernel()
@@ -451,6 +451,20 @@ def __convertStringsToPauli__(self, arg):
 
         return arg
 
+    def getCallableNames(self, *args):
+        callableNames = []
+        for arg in args:
+            if isinstance(arg, PyKernelDecorator):
+                callableNames.append(arg.name)
+            else:
+                if hasattr(arg, '__call__'):
+                    maybeKernelName = getInteropKernelNameIfFound(arg, self.module)
+                    if maybeKernelName != None:
+                        # Remove "__nvqpp__mlirgen__" prefix when packing the list of callables
+                        maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "")
+                        callableNames.append(maybeKernelName)
+        return callableNames
+    
     def __call__(self, *args):
         """
         Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR 
@@ -481,7 +495,8 @@ def __call__(self, *args):
             mlirType = mlirTypeFromPyType(type(arg),
                                           self.module.context,
                                           argInstance=arg,
-                                          argTypeToCompareTo=self.argTypes[i])
+                                          argTypeToCompareTo=self.argTypes[i],
+                                          module=self.module)
 
             if self.isCastablePyType(mlirType, self.argTypes[i]):
                 processedArgs.append(
@@ -496,19 +511,30 @@ def __call__(self, *args):
                 )
 
             if cc.CallableType.isinstance(mlirType):
-                # Assume this is a PyKernelDecorator
-                callableNames.append(arg.name)
-                # It may be that the provided input callable kernel
-                # is not currently in the ModuleOp. Need to add it
-                # if that is the case, we have to use the AST
-                # so that it shares self.module's MLIR Context
-                symbols = SymbolTable(self.module.operation)
-                if nvqppPrefix + arg.name not in symbols:
-                    tmpBridge = PyASTBridge(self.capturedDataStorage,
-                                            existingModule=self.module,
-                                            disableEntryPointTag=True)
-                    tmpBridge.visit(globalAstRegistry[arg.name][0])
-
+                if isinstance(arg, PyKernelDecorator):
+                    # `arg` is a PyKernelDecorator; record its kernel name.
+                    callableNames.append(arg.name)
+                    # It may be that the provided input callable kernel
+                    # is not currently in the ModuleOp. Need to add it
+                    # if that is the case, we have to use the AST
+                    # so that it shares self.module's MLIR Context
+                    symbols = SymbolTable(self.module.operation)
+                    if nvqppPrefix + arg.name not in symbols:
+                        tmpBridge = PyASTBridge(self.capturedDataStorage,
+                                                existingModule=self.module,
+                                                disableEntryPointTag=True)
+                        tmpBridge.visit(globalAstRegistry[arg.name][0])
+                else:
+                    if hasattr(arg, '__call__'):
+                        maybeKernelName = getInteropKernelNameIfFound(arg, self.module)
+                        if maybeKernelName != None:   
+                            # Remove "__nvqpp__mlirgen__" prefix
+                            maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "")
+                            callableNames.append(maybeKernelName)
+                    else:
+                        emitFatalError(
+                            "Invalid callable argument provided to kernel."
+                        )
             # Convert `numpy` arrays to lists
             if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"):
                 if arg.ndim != 1:
diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index efaf213b581..e7f447be516 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -15,8 +15,8 @@
 import types
 
 from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime
-from cudaq.mlir.dialects import quake, cc
-from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType
+from cudaq.mlir.dialects import quake, cc, func
+from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable
 
 State = cudaq_runtime.State
 qvector = cudaq_runtime.qvector
@@ -119,8 +119,9 @@ def isQuantumType(ty):
     if numQuantumMembers != len(mlirEleTypes) or \
         any((quake.StruqType.isinstance(t) for t in mlirEleTypes)):
         return None
-    return quake.StruqType.getNamed(name, mlirEleTypes, context=context)
-
+    if len(name) > 0:
+        return quake.StruqType.getNamed(name, mlirEleTypes, context=context)
+    return quake.StruqType.get(mlirEleTypes, context=context)
 
 def mlirTypeFromAnnotation(annotation, ctx, raiseError=False):
     """
@@ -284,6 +285,7 @@ def emitFatalErrorOverride(msg):
                     f"Adding new fields in data classes is not yet supported. The dataclass must be declared with @dataclass(slots=True) or @dataclasses.dataclass(slots=True)."
                 )
 
+            unnamed_struct = "__repr__" not in pyType.__dict__
             if len({
                     k: v
                     for k, v in pyType.__dict__.items()
@@ -293,7 +295,8 @@ def emitFatalErrorOverride(msg):
                 localEmitFatalError(
                     'struct types with user specified methods are not allowed.')
 
-            tupleTy = mlirTryCreateStructType(structTys, name=id)
+            struct_name = id if not unnamed_struct else ""
+            tupleTy = mlirTryCreateStructType(structTys, name=struct_name) 
             if tupleTy is None:
                 localEmitFatalError(
                     "Hybrid quantum-classical data types and nested quantum structs are not allowed."
@@ -442,7 +445,19 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
     if 'argInstance' in kwargs:
         argInstance = kwargs['argInstance']
         if isinstance(argInstance, Callable):
-            return cc.CallableType.get(argInstance.argTypes, ctx)
+            if hasattr(argInstance, 'argTypes'):
+                return cc.CallableType.get(argInstance.argTypes, ctx)
+            elif "module" in kwargs and hasattr(argInstance, '__call__'):
+                # This is a callable object, check if it's a C++ `qkernel`
+                maybeKernelName = getInteropKernelNameIfFound(argInstance, kwargs['module'])
+                if maybeKernelName != None:
+                    otherKernel = SymbolTable(
+                        kwargs['module'].operation)[maybeKernelName]
+                    if isinstance(otherKernel, func.FuncOp):
+                        argTypes = []
+                        for arg in otherKernel.arguments:
+                            argTypes.append(arg.type)
+                        return cc.CallableType.get(argTypes, ctx)
 
     for name in globalRegisteredTypes.classes:
         customTy, memberTys = globalRegisteredTypes.getClassAttributes(name)
@@ -557,6 +572,27 @@ def mlirTypeToPyType(argType):
     emitFatalError(
         f"Cannot infer python type from provided CUDA-Q type ({argType})")
 
+def getInteropKernelNameIfFound(pyFunc, module):
+    """
+    Given a Python function and an MLIR module, check if the function
+    is registered as an interop kernel. If so, return the kernel name.
+    Otherwise, return None.
+    """
+    if not callable(pyFunc):
+        emitFatalError(
+            f"Provided argument is not a callable function ({pyFunc})"
+        )
+
+    modulePath = str(pyFunc.__module__) if hasattr(pyFunc, '__module__') else ''
+    funcName = str(pyFunc.__name__) if hasattr(pyFunc, '__name__') else ''
+    # Build the registry lookup key: "<module path>.<function name>"
+    devKey = f"{modulePath}.{funcName}"
+    if cudaq_runtime.isRegisteredDeviceModule(devKey):
+        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(module, devKey)
+        if maybeKernelName != None:
+            return maybeKernelName
+
+    return None
 
 def emitErrorIfInvalidPauli(pauliArg):
     """
diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp
index 665836234df..ef59b14f461 100644
--- a/python/runtime/cudaq/algorithms/py_run.cpp
+++ b/python/runtime/cudaq/algorithms/py_run.cpp
@@ -39,7 +39,8 @@ static std::vector<py::object> readRunResults(mlir::ModuleOp module,
 }
 
 static std::tuple<std::string, MlirModule, OpaqueArguments *,
-                  mlir::func::FuncOp, std::string, mlir::func::FuncOp>
+                  mlir::func::FuncOp, std::string, mlir::func::FuncOp,
+                  std::vector<std::string>>
 getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (!py::hasattr(kernel, "arguments"))
     throw std::runtime_error(
@@ -52,6 +53,11 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (py::hasattr(kernel, "compile"))
     kernel.attr("compile")();
 
+  std::vector<std::string> callableNames;
+  if (py::hasattr(kernel, "getCallableNames"))
+    callableNames =
+        kernel.attr("getCallableNames")(*args).cast<std::vector<std::string>>();
+
   auto origKernName = kernel.attr("name").cast<std::string>();
   auto kernelName = origKernName + ".run";
   if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none())
@@ -78,7 +84,8 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   }
   auto *argData = toOpaqueArgs(args, kernelMod, kernelName);
   auto funcOp = getKernelFuncOp(kernelMod, kernelName);
-  return {kernelName, kernelMod, argData, funcOp, origKernName, origKern};
+  return {kernelName,   kernelMod, argData,      funcOp,
+          origKernName, origKern,  callableNames};
 }
 
 static details::RunResultSpan
@@ -86,6 +93,7 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
                MlirModule module, mlir::func::FuncOp funcOp,
                mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs,
                quantum_platform &platform, std::size_t shots_count,
+               const std::vector<std::string> &callableNames,
                std::size_t qpu_id = 0) {
   auto returnTypes = origKernel.getResultTypes();
   if (returnTypes.empty() || returnTypes.size() > 1)
@@ -104,13 +112,13 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
 
   auto mod = unwrap(module);
 
-  auto [rawArgs, size, returnOffset, thunk] =
-      pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false);
+  auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase(
+      name, module, returnTy, runtimeArgs, callableNames, 0, false);
 
   auto results = details::runTheKernel(
       [&]() mutable {
         pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size,
-                       returnOffset, {});
+                       returnOffset, callableNames);
       },
       platform, name, origName, shots_count, qpu_id);
 
@@ -136,7 +144,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   if (shots_count == 0)
     return {};
 
-  auto [name, module, argData, func, origName, origKern] =
+  auto [name, module, argData, func, origName, origKern, callableNames] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -152,7 +160,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   }
 
   auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData,
-                             platform, shots_count);
+                             platform, shots_count, callableNames);
   delete argData;
   auto results = pyReadResults(span, module, func, origKern, shots_count);
 
@@ -187,7 +195,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
                              ") exceeds the number of available QPUs (" +
                              std::to_string(numQPUs) + ")");
 
-  auto [name, module, argData, func, origName, origKern] =
+  auto [name, module, argData, func, origName, origKern, callableNames] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -222,7 +230,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
     QuantumTask wrapped = detail::make_copyable_function(
         [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count,
          qpu_id, argData, name, module, func, origKern, origName,
-         noise_model = std::move(noise_model)]() mutable {
+         noise_model = std::move(noise_model), callableNames]() mutable {
           auto &platform = get_platform();
 
           // Launch the kernel in the appropriate context.
@@ -230,8 +238,9 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
             platform.set_noise(&noise_model.value());
 
           try {
-            auto span = pyRunTheKernel(name, origName, module, func, origKern,
-                                       *argData, platform, shots_count, qpu_id);
+            auto span =
+                pyRunTheKernel(name, origName, module, func, origKern, *argData,
+                               platform, shots_count, callableNames, qpu_id);
             delete argData;
             sp.set_value(span);
             ep.set_value("");
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 9db3e9e431f..5543a54b37d 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -117,8 +117,21 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod,
   auto *argData = new cudaq::OpaqueArguments();
   args = simplifiedValidateInputArguments(args);
   setDataLayout(mod);
-  cudaq::packArgs(*argData, args, kernelFunc,
-                  [](OpaqueArguments &, py::object &) { return false; });
+  auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
+                               py::object &arg) {
+    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
+      // Just give it some dummy data that will not be used.
+      // We synthesize away all callables, the block argument
+      // remains but it is not used, so just give argsCreator
+      // something, and we'll make sure its cleaned up.
+      long *ourAllocatedArg = new long();
+      argData.emplace_back(ourAllocatedArg,
+                           [](void *ptr) { delete static_cast<long *>(ptr); });
+      return true;
+    }
+    return false;
+  };
+  cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler);
   return argData;
 }
 
@@ -157,7 +170,6 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module,
     pm.addPass(cudaq::opt::createGenerateKernelExecution(
         {.startingArgIdx = startingArgIdx}));
     pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true}));
-    pm.addPass(cudaq::opt::createReturnToOutputLog());
     pm.addPass(cudaq::opt::createLambdaLiftingPass());
     pm.addPass(cudaq::opt::createDistributedDeviceCall());
     std::string tl = getTransportLayer();
@@ -947,7 +959,7 @@ void bindAltLaunchKernel(py::module &mod,
 
   auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
                                py::object &arg) {
-    if (py::hasattr(arg, "module")) {
+    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
       // Just give it some dummy data that will not be used.
       // We synthesize away all callables, the block argument
       // remains but it is not used, so just give argsCreator
diff --git a/python/runtime/interop/PythonCppInterop.h b/python/runtime/interop/PythonCppInterop.h
index 9b39aada636..c74a8ec2872 100644
--- a/python/runtime/interop/PythonCppInterop.h
+++ b/python/runtime/interop/PythonCppInterop.h
@@ -7,6 +7,8 @@
  ******************************************************************************/
 #pragma once
 
+#include "cudaq/qis/qkernel.h"
+#include "cudaq/utils/registry.h"
 #include <pybind11/pybind11.h>
 
 namespace py = pybind11;
@@ -166,4 +168,28 @@ void addDeviceKernelInterop(py::module_ &m, const std::string &modName,
                                       kernelName, mangledArgs);
   return;
 }
+
+// Expose a C++ `qkernel` to Python as a registered device kernel.
+template <typename R, typename... Args>
+py::object convertQkernel(py::module_ &m, cudaq::qkernel<R(Args...)> &qkernel,
+                          const std::string &docstring = "") {
+  const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull(
+      cudaq::registry::__cudaq_getLinkableKernelKey(&qkernel));
+  if (!qkernelName)
+    throw std::runtime_error(
+        "Invalid `qkernel` passed, could not find registered kernel.");
+  std::string kernelName = qkernelName;
+  // Remove the "function_" prefix if it exists
+  if (kernelName.starts_with("function_"))
+    kernelName = kernelName.substr(std::string("function_").length());
+  const std::string docStr =
+      docstring.empty()
+          ? "Auto-generated kernel from C++ " + kernelName + " qkernel."
+          : docstring;
+  m.def(
+      kernelName.c_str(), [](Args...) {}, docStr.c_str());
+  cudaq::python::registerDeviceKernel(m.attr("__name__").cast<std::string>(),
+                                      kernelName, "");
+  return m.attr(kernelName.c_str());
+}
 } // namespace cudaq::python
diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp
index 7c5cbb23054..ed313b32f88 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.cpp
+++ b/python/tests/interop/quantum_lib/quantum_lib.cpp
@@ -29,4 +29,12 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector<double> &x,
 __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); }
 
 __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); }
+
+__qpu__ void reset_group(patch p) {
+  for (std::size_t i = 0; i < p.data.size(); i++)
+    reset(p.data[i]);
+}
+
+__qpu__ void x_group(patch p) { x(p.data); }
+
 } // namespace cudaq
diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h
index a0655099237..4b9fa371351 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.h
+++ b/python/tests/interop/quantum_lib/quantum_lib.h
@@ -9,6 +9,12 @@
 
 #include "cudaq/qis/qubit_qis.h"
 
+// Custom data structure
+struct patch {
+  cudaq::qview<> data;
+  cudaq::qview<> aux;
+};
+
 namespace cudaq {
 void entryPoint(const std::function<void(cudaq::qvector<> &)> &statePrep);
 
@@ -19,4 +25,8 @@ void another(cudaq::qview<> qubits, std::size_t);
 
 void uccsd(cudaq::qview<> qubits, std::size_t);
 
+void reset_group(patch p);
+
+void x_group(patch p);
+
 } // namespace cudaq
diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
index 4ea2d2176cc..92f1382f2a8 100644
--- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
+++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
@@ -8,6 +8,7 @@
 
 #include "cudaq.h"
 #include "cudaq/algorithms/sample.h"
+#include "cudaq/qis/qkernel.h"
 #include "quantum_lib/quantum_lib.h"
 #include "runtime/interop/PythonCppInterop.h"
 #include <pybind11/pybind11.h>
@@ -15,6 +16,22 @@
 
 namespace py = pybind11;
 
+namespace {
+static std::unordered_map<std::string,
+                          cudaq::qkernel<void(cudaq::qview<>, std::size_t)>>
+    g_cppKernels_1;
+
+static std::unordered_map<std::string, cudaq::qkernel<void(patch)>>
+    g_cppKernels_2;
+
+static const bool initKernels = []() {
+  g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd));
+  g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group));
+  g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group));
+  return true;
+}();
+} // namespace
+
 PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   m.def("test_cpp_qalgo", [](py::object statePrepIn) {
@@ -49,4 +66,26 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   cudaq::python::addDeviceKernelInterop<cudaq::qview<>, std::size_t>(
       m, "qstd", "uccsd", "");
+
+  // Convert the C++ kernel registry to Python-accessible kernels
+  auto interopSubMod = m.def_submodule("_cpp_interop_kernels");
+  static std::unordered_map<std::string, py::object> g_py_kernels;
+
+  for (auto &[name, kernel] : g_cppKernels_1) {
+    g_py_kernels.insert(std::make_pair(
+        name, cudaq::python::convertQkernel(interopSubMod, kernel)));
+  }
+
+  for (auto &[name, kernel] : g_cppKernels_2) {
+    g_py_kernels.insert(std::make_pair(
+        name, cudaq::python::convertQkernel(interopSubMod, kernel)));
+  }
+
+  m.def("get_cpp_kernel", [](const std::string &name) {
+    auto it = g_py_kernels.find(name);
+    if (it == g_py_kernels.end())
+      throw std::runtime_error("No C++ kernel registered for requested name.");
+
+    return it->second;
+  });
 }
diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py
index e63588408ac..4324e79c02c 100644
--- a/python/tests/interop/test_interop.py
+++ b/python/tests/interop/test_interop.py
@@ -7,6 +7,8 @@
 # ============================================================================ #
 
 import cudaq, pytest
+from typing import Callable
+from dataclasses import dataclass
 
 cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo')
 
@@ -242,3 +244,79 @@ def entry():
         takesCapture(spin)
 
     entry.compile()
+
+
+def test_cpp_qkernel():
+    # Test the `qkernel` provided in C++ via a map-like registry.
+    # This is provided as a function-like callable.
+    kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd")
+
+    # Use as a capture
+    @cudaq.kernel
+    def cpp_qkernel():
+        q = cudaq.qvector(4)
+        kernel_from_cpp_registry(q, 0)
+
+    cpp_qkernel()
+
+    # Use as a callable argument
+    @cudaq.kernel
+    def caller(k: Callable[[cudaq.qview, int], None]):
+        q = cudaq.qvector(4)
+        k(q, 0)
+
+    caller(kernel_from_cpp_registry)
+
+
+def test_cpp_custom_struct():
+    # Define a struct in Python that matches the C++ struct
+    # Note: use `repr=False` to annotate that this is an unnamed struct.
+    # This will maintain compatibility with C++ structs that do not have
+    # a name.
+    @dataclass(slots=True, repr=False)
+    class patch:
+        data: cudaq.qvector
+        aux: cudaq.qvector
+
+    reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset")
+    x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x")
+
+    # Use as a capture
+    @cudaq.kernel
+    def cpp_qkernel_struct():
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        x(q)
+        reset_qkernel(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct)
+    counts.dump()
+    assert len(counts) == 1 and '000000' in counts
+
+    @cudaq.kernel
+    def cpp_qkernel_struct_x():
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        x_qkernel(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct_x)
+    counts.dump()
+    assert len(counts) == 1 and '111100' in counts
+
+    # Use as a callable argument
+    @cudaq.kernel
+    def cpp_qkernel_struct_callable(k: Callable[[patch], None]):
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        for i in range(4):
+            if i % 2 == 0:
+                x(q[i])
+        k(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct_callable, reset_qkernel)
+    counts.dump()
+    assert len(counts) == 1 and '000000' in counts
+
+    counts = cudaq.sample(cpp_qkernel_struct_callable, x_qkernel)
+    counts.dump()
+    assert len(counts) == 1 and '010100' in counts