From 6668e8749bde78f0d3211420e27d3b3aac26891d Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 12 Oct 2024 22:34:29 +0900 Subject: [PATCH 1/3] support cuda_malloc imported from comfy: https://github.com/comfyanonymous/ComfyUI/blob/f1d6cef71c70719cc3ed45a2455a4e5ac910cd5e/cuda_malloc.py original commits: - https://github.com/comfyanonymous/ComfyUI/commit/799c08a: Auto disable cuda malloc on some GPUs on windows. - https://github.com/comfyanonymous/ComfyUI/commit/d39c58b: Disable cuda malloc on GTX 750 Ti. - https://github.com/comfyanonymous/ComfyUI/commit/85a8900: Disable cuda malloc on regular GTX 960. - https://github.com/comfyanonymous/ComfyUI/commit/30de083: Disable cuda malloc on all the 9xx series. - https://github.com/comfyanonymous/ComfyUI/commit/7c0a5a3: Disable cuda malloc on a bunch of quadro cards. - https://github.com/comfyanonymous/ComfyUI/commit/5a90d3c: GeForce MX110 + MX130 are maxwell. - https://github.com/comfyanonymous/ComfyUI/commit/fc71cf6: Add some 800M gpus to cuda malloc blacklist. - https://github.com/comfyanonymous/ComfyUI/commit/861fd58: Add a warning if a card that doesn't support cuda malloc has it enabled. - https://github.com/comfyanonymous/ComfyUI/commit/192ca06: Add some more cards to the cuda malloc blacklist. - https://github.com/comfyanonymous/ComfyUI/commit/caddef8: Auto disable cuda malloc on unsupported GPUs on Linux. - https://github.com/comfyanonymous/ComfyUI/commit/2f93b91: Add Tesla GPUs to cuda malloc blacklist. 
--- cuda_malloc.py | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++ webui.py | 3 ++ 2 files changed, 100 insertions(+) create mode 100644 cuda_malloc.py diff --git a/cuda_malloc.py b/cuda_malloc.py new file mode 100644 index 00000000000..41bd1368ee8 --- /dev/null +++ b/cuda_malloc.py @@ -0,0 +1,97 @@ +# from comfyui with minor modification +import os +import importlib.util +import subprocess + +from modules import cmd_args + +#Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import. +def get_gpu_names(): + if os.name == 'nt': + import ctypes + + # Define necessary C structures and types + class DISPLAY_DEVICEA(ctypes.Structure): + _fields_ = [ + ('cb', ctypes.c_ulong), + ('DeviceName', ctypes.c_char * 32), + ('DeviceString', ctypes.c_char * 128), + ('StateFlags', ctypes.c_ulong), + ('DeviceID', ctypes.c_char * 128), + ('DeviceKey', ctypes.c_char * 128) + ] + + # Load user32.dll + user32 = ctypes.windll.user32 + + # Call EnumDisplayDevicesA + def enum_display_devices(): + device_info = DISPLAY_DEVICEA() + device_info.cb = ctypes.sizeof(device_info) + device_index = 0 + gpu_names = set() + + while user32.EnumDisplayDevicesA(None, device_index, ctypes.byref(device_info), 0): + device_index += 1 + gpu_names.add(device_info.DeviceString.decode('utf-8')) + return gpu_names + return enum_display_devices() + else: + gpu_names = set() + out = subprocess.check_output(['nvidia-smi', '-L']) + for l in out.split(b'\n'): + if len(l) > 0: + gpu_names.add(l.decode('utf-8').split(' (UUID')[0]) + return gpu_names + +blacklist = {"GeForce GTX TITAN X", "GeForce GTX 980", "GeForce GTX 970", "GeForce GTX 960", "GeForce GTX 950", "GeForce 945M", + "GeForce 940M", "GeForce 930M", "GeForce 920M", "GeForce 910M", "GeForce GTX 750", "GeForce GTX 745", "Quadro K620", + "Quadro K1200", "Quadro K2200", "Quadro M500", "Quadro M520", "Quadro M600", "Quadro M620", "Quadro M1000", + "Quadro M1200", "Quadro M2000", "Quadro M2200", "Quadro M3000", 
"Quadro M4000", "Quadro M5000", "Quadro M5500", "Quadro M6000", + "GeForce MX110", "GeForce MX130", "GeForce 830M", "GeForce 840M", "GeForce GTX 850M", "GeForce GTX 860M", + "GeForce GTX 1650", "GeForce GTX 1630", "Tesla M4", "Tesla M6", "Tesla M10", "Tesla M40", "Tesla M60" + } + +def cuda_malloc_supported(): + try: + names = get_gpu_names() + except: + names = set() + for x in names: + if "NVIDIA" in x: + for b in blacklist: + if b in x: + return False + return True + + +parser = cmd_args.parser +args, _ = parser.parse_known_args() + + +if not args.cuda_malloc: + try: + version = "" + torch_spec = importlib.util.find_spec("torch") + for folder in torch_spec.submodule_search_locations: + ver_file = os.path.join(folder, "version.py") + if os.path.isfile(ver_file): + spec = importlib.util.spec_from_file_location("torch_version_import", ver_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + version = module.__version__ + if int(version[0]) >= 2: #enable by default for torch version 2.0 and up + args.cuda_malloc = cuda_malloc_supported() + except: + pass + + +if args.cuda_malloc and not args.disable_cuda_malloc: + env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None) + if env_var is None: + env_var = "backend:cudaMallocAsync" + else: + env_var += ",backend:cudaMallocAsync" + + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var + print(f"Setup environment PYTORCH_CUDA_ALLOC_CONF={env_var}") diff --git a/webui.py b/webui.py index 2c417168aa6..0567668d8b4 100644 --- a/webui.py +++ b/webui.py @@ -10,6 +10,9 @@ startup_timer = timer.startup_timer startup_timer.record("launcher") +import cuda_malloc +startup_timer.record("cuda_malloc") + initialize.imports() initialize.check_versions() From e78be27e7540ac33ed6b5668ed5e71f55a5f7667 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 12 Oct 2024 22:49:14 +0900 Subject: [PATCH 2/3] add --cuda-malloc, --disable-cuda-malloc cmd args --- modules/cmd_args.py | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/modules/cmd_args.py b/modules/cmd_args.py index d71982b2c12..2947ec395e6 100644 --- a/modules/cmd_args.py +++ b/modules/cmd_args.py @@ -9,6 +9,9 @@ parser.add_argument("--update-all-extensions", action='store_true', help="launch.py argument: download updates for all extensions when starting the program") parser.add_argument("--skip-python-version-check", action='store_true', help="launch.py argument: do not check python version") parser.add_argument("--skip-torch-cuda-test", action='store_true', help="launch.py argument: do not check if CUDA is able to work properly") +cm_group = parser.add_mutually_exclusive_group() +cm_group.add_argument("--cuda-malloc", action='store_true', help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).") +cm_group.add_argument("--disable-cuda-malloc", action="store_true", help="Disable cudaMallocAsync.") parser.add_argument("--reinstall-xformers", action='store_true', help="launch.py argument: install the appropriate version of xformers even if you have some version already installed") parser.add_argument("--reinstall-torch", action='store_true', help="launch.py argument: install the appropriate version of torch even if you have some version already installed") parser.add_argument("--update-check", action='store_true', help="launch.py argument: check for updates at startup") From 0cc81464bb9482195941baa42b3c88963345bfe3 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 12 Oct 2024 22:52:40 +0900 Subject: [PATCH 3/3] lint, add init_cuda_malloc() --- cuda_malloc.py | 28 ++++++++++++++-------------- webui.py | 3 ++- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/cuda_malloc.py b/cuda_malloc.py index 41bd1368ee8..fae5d73cfdc 100644 --- a/cuda_malloc.py +++ b/cuda_malloc.py @@ -39,9 +39,9 @@ def enum_display_devices(): else: gpu_names = set() out = subprocess.check_output(['nvidia-smi', '-L']) - for l in out.split(b'\n'): - if len(l) > 0: - 
gpu_names.add(l.decode('utf-8').split(' (UUID')[0]) + for line in out.split(b'\n'): + if len(line) > 0: + gpu_names.add(line.decode('utf-8').split(' (UUID')[0]) return gpu_names blacklist = {"GeForce GTX TITAN X", "GeForce GTX 980", "GeForce GTX 970", "GeForce GTX 960", "GeForce GTX 950", "GeForce 945M", @@ -55,7 +55,7 @@ def enum_display_devices(): def cuda_malloc_supported(): try: names = get_gpu_names() - except: + except Exception: names = set() for x in names: if "NVIDIA" in x: @@ -82,16 +82,16 @@ def cuda_malloc_supported(): version = module.__version__ if int(version[0]) >= 2: #enable by default for torch version 2.0 and up args.cuda_malloc = cuda_malloc_supported() - except: + except Exception: pass +def init_cuda_malloc(): + if args.cuda_malloc and not args.disable_cuda_malloc: + env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None) + if env_var is None: + env_var = "backend:cudaMallocAsync" + else: + env_var += ",backend:cudaMallocAsync" -if args.cuda_malloc and not args.disable_cuda_malloc: - env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None) - if env_var is None: - env_var = "backend:cudaMallocAsync" - else: - env_var += ",backend:cudaMallocAsync" - - os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var - print(f"Setup environment PYTORCH_CUDA_ALLOC_CONF={env_var}") + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var + print(f"Setup environment PYTORCH_CUDA_ALLOC_CONF={env_var}") diff --git a/webui.py b/webui.py index 0567668d8b4..aef977f7c20 100644 --- a/webui.py +++ b/webui.py @@ -10,7 +10,8 @@ startup_timer = timer.startup_timer startup_timer.record("launcher") -import cuda_malloc +from cuda_malloc import init_cuda_malloc +init_cuda_malloc() startup_timer.record("cuda_malloc") initialize.imports()