[pull] master from comfyanonymous:master #7


Merged 8 commits on Aug 4, 2024
1 change: 1 addition & 0 deletions comfy/model_base.py
@@ -94,6 +94,7 @@ def __init__(self, model_config, model_type=ModelType.EPS, device=None, unet_mod
         self.concat_keys = ()
         logging.info("model_type {}".format(model_type.name))
         logging.debug("adm {}".format(self.adm_channels))
+        logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
         self.memory_usage_factor = model_config.memory_usage_factor

     def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
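The only change here is an extra INFO log reporting which dtype the model weights were loaded in and which manual-cast dtype (if any) is active. A minimal sketch of what that line prints, with made-up dtype values standing in for self.get_dtype() and self.manual_cast_dtype:

```python
import logging
import torch

logging.basicConfig(level=logging.INFO)

# Hypothetical values: fp8 weights with fp16 manual casting (requires a
# PyTorch build that has the float8 dtypes, i.e. torch >= 2.1).
weight_dtype = torch.float8_e4m3fn
manual_cast_dtype = torch.float16

logging.info("model weight dtype {}, manual cast: {}".format(weight_dtype, manual_cast_dtype))
# INFO:root:model weight dtype torch.float8_e4m3fn, manual cast: torch.float16
```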
29 changes: 22 additions & 7 deletions comfy/model_management.py
@@ -450,7 +450,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
         if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM):
             model_size = loaded_model.model_memory_required(torch_dev)
             current_free_mem = get_free_memory(torch_dev)
-            lowvram_model_memory = int(max(64 * (1024 * 1024), (current_free_mem - minimum_memory_required)))
+            lowvram_model_memory = max(64 * (1024 * 1024), (current_free_mem - minimum_memory_required), min(current_free_mem * 0.4, current_free_mem - minimum_inference_memory()))
             if model_size <= lowvram_model_memory: #only switch to lowvram if really necessary
                 lowvram_model_memory = 0

@@ -527,6 +527,9 @@ def unet_inital_load_device(parameters, dtype):
     else:
         return cpu_dev

+def maximum_vram_for_weights(device=None):
+    return (get_total_memory(device) * 0.8 - minimum_inference_memory())
+
 def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
     if args.bf16_unet:
         return torch.bfloat16
@@ -536,6 +539,21 @@ def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, tor
         return torch.float8_e4m3fn
     if args.fp8_e5m2_unet:
         return torch.float8_e5m2
+
+    fp8_dtype = None
+    try:
+        for dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+            if dtype in supported_dtypes:
+                fp8_dtype = dtype
+                break
+    except:
+        pass
+
+    if fp8_dtype is not None:
+        free_model_memory = maximum_vram_for_weights(device)
+        if model_params * 2 > free_model_memory:
+            return fp8_dtype
+
     if should_use_fp16(device=device, model_params=model_params, manual_cast=True):
         if torch.float16 in supported_dtypes:
             return torch.float16
@@ -871,7 +889,7 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
             fp16_works = True

     if fp16_works or manual_cast:
-        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
+        free_model_memory = maximum_vram_for_weights(device)
         if (not prioritize_performance) or model_params * 4 > free_model_memory:
             return True

@@ -910,17 +928,14 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
     if is_intel_xpu():
         return True

-    if device is None:
-        device = torch.device("cuda")
-
-    props = torch.cuda.get_device_properties(device)
+    props = torch.cuda.get_device_properties("cuda")
     if props.major >= 8:
         return True

     bf16_works = torch.cuda.is_bf16_supported()

     if bf16_works or manual_cast:
-        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
+        free_model_memory = maximum_vram_for_weights(device)
         if (not prioritize_performance) or model_params * 4 > free_model_memory:
             return True
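The core of this file's change: maximum_vram_for_weights() budgets roughly 80% of total VRAM minus the inference reserve for holding weights, and unet_dtype() now falls back to an fp8 dtype when fp16 weights (2 bytes per parameter) would not fit in that budget. A standalone sketch of the decision, with simplified stand-ins for get_total_memory() and minimum_inference_memory() (the numbers below are illustrative, not ComfyUI's real defaults):

```python
import torch

def minimum_inference_memory():
    # Stand-in: reserve ~1 GiB of VRAM for activations during sampling.
    return 1024 * 1024 * 1024

def maximum_vram_for_weights(total_vram):
    # Mirrors the PR: only ~80% of total VRAM, minus the inference reserve,
    # is treated as available for model weights.
    return total_vram * 0.8 - minimum_inference_memory()

def pick_unet_dtype(model_params, total_vram, supported_dtypes):
    # fp16/bf16 weights cost 2 bytes per parameter; if that exceeds the
    # weight budget and the model supports an fp8 dtype, use fp8 instead.
    fp8_dtype = next((d for d in (torch.float8_e4m3fn, torch.float8_e5m2)
                      if d in supported_dtypes), None)
    if fp8_dtype is not None and model_params * 2 > maximum_vram_for_weights(total_vram):
        return fp8_dtype
    return torch.float16

# Example: a 12B-parameter model on a 16 GiB card.
params = 12_000_000_000
vram = 16 * 1024**3
print(pick_unet_dtype(params, vram, [torch.bfloat16, torch.float8_e4m3fn]))
# torch.float8_e4m3fn: ~24 GB of fp16 weights will not fit in the ~11.8 GiB budget
```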
2 changes: 1 addition & 1 deletion comfy/samplers.py
@@ -171,7 +171,7 @@ def calc_cond_batch(model, conds, x_in, timestep, model_options):
         for i in range(1, len(to_batch_temp) + 1):
             batch_amount = to_batch_temp[:len(to_batch_temp)//i]
             input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
-            if model.memory_required(input_shape) < free_memory:
+            if model.memory_required(input_shape) * 1.5 < free_memory:
                 to_batch = batch_amount
                 break
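calc_cond_batch searches for the largest conditioning batch whose estimated memory fits in free VRAM; the change pads that estimate by a 1.5x safety factor so the batch shrinks sooner. A simplified version of the search with a toy memory estimate (the real code calls model.memory_required(input_shape)):

```python
def pick_batch(to_batch_temp, free_memory, memory_required):
    # Same shape as the loop in calc_cond_batch: try the full batch, then
    # 1/2, 1/3, ... of it, keeping the first size whose padded estimate fits.
    to_batch = to_batch_temp[:1]
    for i in range(1, len(to_batch_temp) + 1):
        batch_amount = to_batch_temp[:len(to_batch_temp) // i]
        if memory_required(len(batch_amount)) * 1.5 < free_memory:
            to_batch = batch_amount
            break
    return to_batch

# Toy estimate: 1 GiB per conditioning in the batch, 7 GiB free.
estimate = lambda n: n * 1024**3
picked = pick_batch(list(range(8)), free_memory=7 * 1024**3, memory_required=estimate)
print(len(picked))  # 4: 8 items would need 12 GiB after the 1.5x margin, 4 need 6 GiB
```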
7 changes: 6 additions & 1 deletion comfy/sd.py
@@ -510,13 +510,18 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o

     diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
     parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
+    weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
     load_device = model_management.get_torch_device()

     model_config = model_detection.model_config_from_unet(sd, diffusion_model_prefix)
     if model_config is None:
         raise RuntimeError("ERROR: Could not detect model type of: {}".format(ckpt_path))

-    unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
+    unet_weight_dtype = list(model_config.supported_inference_dtypes)
+    if weight_dtype is not None:
+        unet_weight_dtype.append(weight_dtype)
+
+    unet_dtype = model_management.unet_dtype(model_params=parameters, supported_dtypes=unet_weight_dtype)
     manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes)
     model_config.set_inference_dtype(unet_dtype, manual_cast_dtype)
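The effect of this hunk is that the dtype the checkpoint was actually stored in is added to the candidate list handed to model_management.unet_dtype(), so an fp8-saved checkpoint can be picked up as fp8 rather than being limited to the config's defaults. A small illustration (the dtype values are hypothetical, not taken from a real model config):

```python
import torch

# Hypothetical defaults from a model config.
supported_inference_dtypes = [torch.bfloat16, torch.float16]

# Dominant storage dtype detected in the checkpoint by comfy.utils.weight_dtype().
weight_dtype = torch.float8_e4m3fn

unet_weight_dtype = list(supported_inference_dtypes)
if weight_dtype is not None:
    unet_weight_dtype.append(weight_dtype)

print(unet_weight_dtype)
# [torch.bfloat16, torch.float16, torch.float8_e4m3fn]
# unet_dtype() can now return the checkpoint's own fp8 dtype when VRAM is tight.
```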
6 changes: 5 additions & 1 deletion comfy/supported_models.py
@@ -652,7 +652,11 @@ def get_model(self, state_dict, prefix="", device=None):
         return out

     def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.FluxClipModel)
+        pref = self.text_encoder_key_prefix[0]
+        t5_key = "{}t5xxl.transformer.encoder.final_layer_norm.weight".format(pref)
+        if t5_key in state_dict:
+            dtype_t5 = state_dict[t5_key].dtype
+        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(dtype_t5=dtype_t5))

 class FluxSchnell(Flux):
     unet_config = {
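The new clip_target probes a single well-known T5-XXL weight in the state dict to learn which dtype the text encoder was saved in, then hands that to flux_clip so an fp8 T5 stays in fp8. A minimal sketch of the probe with a dummy state dict; the "text_encoders." prefix is an assumption standing in for self.text_encoder_key_prefix[0]:

```python
import torch

def detect_t5_dtype(state_dict, pref="text_encoders."):
    # Same idea as Flux.clip_target: check one known T5-XXL weight and
    # report the dtype it was serialized in, or None if it is absent.
    t5_key = "{}t5xxl.transformer.encoder.final_layer_norm.weight".format(pref)
    if t5_key in state_dict:
        return state_dict[t5_key].dtype
    return None

toy_sd = {
    "text_encoders.t5xxl.transformer.encoder.final_layer_norm.weight":
        torch.zeros(4096).to(torch.float8_e4m3fn),  # pretend fp8-stored T5
}
print(detect_t5_dtype(toy_sd))  # torch.float8_e4m3fn, passed on as dtype_t5
```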
15 changes: 14 additions & 1 deletion comfy/utils.py
@@ -40,9 +40,22 @@ def calculate_parameters(sd, prefix=""):
     params = 0
     for k in sd.keys():
         if k.startswith(prefix):
-            params += sd[k].nelement()
+            w = sd[k]
+            params += w.nelement()
     return params

+def weight_dtype(sd, prefix=""):
+    dtypes = {}
+    for k in sd.keys():
+        if k.startswith(prefix):
+            w = sd[k]
+            dtypes[w.dtype] = dtypes.get(w.dtype, 0) + 1
+
+    if len(dtypes) == 0:
+        return None
+
+    return max(dtypes, key=dtypes.get)
+
 def state_dict_key_replace(state_dict, keys_to_replace):
     for x in keys_to_replace:
         if x in state_dict:
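weight_dtype() is a majority vote over the tensors under a prefix: count each dtype and return the most common one, or None when the prefix matches nothing. A quick usage sketch with a dummy state dict:

```python
import torch

def weight_dtype(sd, prefix=""):
    # Count how many tensors under the prefix use each dtype and return
    # the most common one; None if no key matches the prefix.
    dtypes = {}
    for k in sd.keys():
        if k.startswith(prefix):
            dtypes[sd[k].dtype] = dtypes.get(sd[k].dtype, 0) + 1
    if len(dtypes) == 0:
        return None
    return max(dtypes, key=dtypes.get)

toy_sd = {
    "model.diffusion_model.a.weight": torch.zeros(1).to(torch.float8_e4m3fn),
    "model.diffusion_model.b.weight": torch.zeros(1).to(torch.float8_e4m3fn),
    "model.diffusion_model.norm.weight": torch.zeros(1, dtype=torch.float32),
    "first_stage_model.decoder.weight": torch.zeros(1, dtype=torch.float16),
}
print(weight_dtype(toy_sd, "model.diffusion_model."))  # torch.float8_e4m3fn
print(weight_dtype(toy_sd, "no.such.prefix."))         # None
```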