Skip to content

Commit

Permalink
Cache _Reduce and _get_type_info results
Browse files Browse the repository at this point in the history
  • Loading branch information
shwina committed Dec 2, 2024
1 parent af0a8bb commit 31ee5a5
Showing 1 changed file with 27 additions and 9 deletions.
36 changes: 27 additions & 9 deletions python/cuda_parallel/cuda/parallel/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import functools
import importlib
import ctypes
import shutil
Expand Down Expand Up @@ -89,6 +90,7 @@ class _CCCLValue(ctypes.Structure):
("state", ctypes.c_void_p)]


@functools.cache
def _type_to_info(numpy_type):
numba_type = numba.from_dtype(numpy_type)
context = cuda.descriptor.cuda_target.target_context
Expand Down Expand Up @@ -194,14 +196,15 @@ def _dtype_validation(dt1, dt2):


class _Reduce:
def __init__(self, d_in, d_out, op, init):
# TODO: constructor shouldn't require concrete `d_in`, `d_out`:
def __init__(self, d_in, d_out, op, h_init):
self._ctor_d_in_dtype = d_in.dtype
self._ctor_d_out_dtype = d_out.dtype
self._ctor_init_dtype = init.dtype
self._ctor_init_dtype = h_init.dtype
cc_major, cc_minor = cuda.get_current_device().compute_capability
cub_path, thrust_path, libcudacxx_path, cuda_include_path = _get_paths()
bindings = _get_bindings()
accum_t = init.dtype
accum_t = h_init.dtype
self.op_wrapper = _Op(accum_t, op)
d_in_ptr = _device_array_to_pointer(d_in)
d_out_ptr = _device_array_to_pointer(d_out)
Expand All @@ -212,7 +215,7 @@ def __init__(self, d_in, d_out, op, init):
d_in_ptr,
d_out_ptr,
self.op_wrapper.handle(),
_host_array_to_value(init),
_host_array_to_value(h_init),
cc_major,
cc_minor,
ctypes.c_char_p(cub_path),
Expand All @@ -223,11 +226,11 @@ def __init__(self, d_in, d_out, op, init):
if error != enums.CUDA_SUCCESS:
raise ValueError('Error building reduce')

def __call__(self, temp_storage, d_in, d_out, init):
def __call__(self, temp_storage, d_in, d_out, h_init):
# TODO validate POINTER vs ITERATOR when iterator support is added
_dtype_validation(self._ctor_d_in_dtype, d_in.dtype)
_dtype_validation(self._ctor_d_out_dtype, d_out.dtype)
_dtype_validation(self._ctor_init_dtype, init.dtype)
_dtype_validation(self._ctor_init_dtype, h_init.dtype)
bindings = _get_bindings()
if temp_storage is None:
temp_storage_bytes = ctypes.c_size_t()
Expand All @@ -247,7 +250,7 @@ def __call__(self, temp_storage, d_in, d_out, init):
d_out_ptr,
num_items,
self.op_wrapper.handle(),
_host_array_to_value(init),
_host_array_to_value(h_init),
None)
if error != enums.CUDA_SUCCESS:
raise ValueError('Error reducing')
Expand All @@ -259,10 +262,25 @@ def __del__(self):
bindings.cccl_device_reduce_cleanup(ctypes.byref(self.build_result))


def _get_reducer():
    """Build and return a memoizing factory for ``_Reduce`` objects.

    The returned callable keys its cache on the input/output/init dtypes
    and the reduction operator, so repeated requests with matching types
    reuse a single built reducer instead of rebuilding it.
    """
    # note: currently this is an unbounded cache:
    _reducers = {}

    def inner(d_in, d_out, op, init):
        # dtypes + operator fully determine the built reducer
        lookup = (d_in.dtype, d_out.dtype, op, init.dtype)
        reducer = _reducers.get(lookup)
        if reducer is None:
            reducer = _Reduce(d_in, d_out, op, init)
            _reducers[lookup] = reducer
        return reducer

    return inner


get_reducer = _get_reducer()

# TODO Figure out iterators
# TODO Figure out `sum` without operator and initial value
# TODO Accept stream
def reduce_into(d_in, d_out, op, init):
def reduce_into(d_in, d_out, op, h_init):
"""Computes a device-wide reduction using the specified binary ``op`` functor and initial value ``init``.
Example:
Expand Down Expand Up @@ -292,4 +310,4 @@ def reduce_into(d_in, d_out, op, init):
Returns:
A callable object that can be used to perform the reduction
"""
return _Reduce(d_in, d_out, op, init)
return get_reducer(d_in, d_out, op, h_init)

0 comments on commit 31ee5a5

Please sign in to comment.