Spaces:

DawnC
/

PawMatchAI

Running on Zero

App Files Files Community

DawnC committed on Dec 1, 2024

Commit

14ee6e4

•

1 Parent(s): f222f88

Update device_manager.py

Browse files

Files changed (1) hide show

device_manager.py +32 -22

device_manager.py CHANGED Viewed

@@ -1,6 +1,5 @@
 from functools import wraps
 import torch
-from huggingface_hub import HfApi
 import os
 import logging
@@ -22,48 +21,59 @@ class DeviceManager:
         self._initialized = True
         self._current_device = None
     def check_zero_gpu_availability(self):
         try:
-            # 檢查 Hugging Face Space 環境變數
-            if not os.environ.get('SPACE_ID'):
-                return False
-            # 檢查是否在 Spaces 環境中並且啟用了 ZeroGPU
-            if os.environ.get('ZERO_GPU_AVAILABLE') == '1':
-                return True
-            return False
         except Exception as e:
-            logger.warning(f"Error checking ZeroGPU availability: {e}")
-            return False
     def get_optimal_device(self):
         if self._current_device is None:
             if self.check_zero_gpu_availability():
                 try:
-                    # 確保 CUDA 可用
-                    if torch.cuda.is_available():
-                        self._current_device = torch.device('cuda')
-                        logger.info("Using ZeroGPU")
-                    else:
-                        raise RuntimeError("CUDA not available")
                 except Exception as e:
-                    logger.warning(f"Failed to initialize ZeroGPU: {e}")
                     self._current_device = torch.device('cpu')
-                    logger.info("Fallback to CPU due to GPU initialization failure")
             else:
                 self._current_device = torch.device('cpu')
                 logger.info("Using CPU (ZeroGPU not available)")
         return self._current_device
     def move_to_device(self, tensor_or_model):
         device = self.get_optimal_device()
         try:
             if hasattr(tensor_or_model, 'to'):
                 return tensor_or_model.to(device)
-        except Exception:
             self._current_device = torch.device('cpu')
             if hasattr(tensor_or_model, 'to'):
                 return tensor_or_model.to('cpu')

 from functools import wraps
 import torch
 import os
 import logging
         self._initialized = True
         self._current_device = None
+        self.initialize_zero_gpu()
+    def initialize_zero_gpu(self):
+        """初始化 ZeroGPU"""
+        try:
+            # 檢查是否在 Hugging Face Spaces 環境中
+            if os.environ.get('SPACE_ID'):
+                # 嘗試初始化 ZeroGPU
+                os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+                # 設置必要的環境變數
+                os.environ['ZERO_GPU'] = '1'
+                logger.info("ZeroGPU environment initialized")
+        except Exception as e:
+            logger.warning(f"Failed to initialize ZeroGPU environment: {e}")
     def check_zero_gpu_availability(self):
+        """檢查 ZeroGPU 是否可用"""
         try:
+            if os.environ.get('SPACE_ID') and os.environ.get('ZERO_GPU') == '1':
+                # 確保 CUDA 運行時環境正確設置
+                if torch.cuda.is_available():
+                    torch.cuda.init()
+                    return True
         except Exception as e:
+            logger.warning(f"ZeroGPU check failed: {e}")
+        return False
     def get_optimal_device(self):
+        """獲取最佳可用設備"""
         if self._current_device is None:
             if self.check_zero_gpu_availability():
                 try:
+                    self._current_device = torch.device('cuda')
+                    logger.info("Using ZeroGPU")
+                    # 嘗試進行一次小規模的 CUDA 操作來驗證
+                    torch.zeros(1).cuda()
                 except Exception as e:
+                    logger.warning(f"Failed to use ZeroGPU: {e}")
                     self._current_device = torch.device('cpu')
+                    logger.info("Fallback to CPU")
             else:
                 self._current_device = torch.device('cpu')
                 logger.info("Using CPU (ZeroGPU not available)")
         return self._current_device
     def move_to_device(self, tensor_or_model):
+        """將張量或模型移動到最佳設備"""
         device = self.get_optimal_device()
         try:
             if hasattr(tensor_or_model, 'to'):
                 return tensor_or_model.to(device)
+        except Exception as e:
+            logger.warning(f"Failed to move to {device}, falling back to CPU: {e}")
             self._current_device = torch.device('cpu')
             if hasattr(tensor_or_model, 'to'):
                 return tensor_or_model.to('cpu')