-
Couldn't load subscription status.
- Fork 119
Description
I think the problem is that we generally expect an input scaling of 1.0 for centered instance models, since their inputs are already crops. Training handles this appropriately, but for some reason the visualization does not (it's probably missing the input scaling transformer/preprocessing).
In general, I think we can solve this by switching to the InferenceModel classes to generate visualizations, so that we're not running custom inference routines inside the Trainer classes.
Here's the relevant error:
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\callbacks.py", line 280, in on_epoch_end
figure = self.plot_fn()
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\training.py", line 1328, in <lambda>
viz_fn=lambda: visualize_example(next(training_viz_ds_iter)),
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\training.py", line 1308, in visualize_example
preds = find_peaks(tf.expand_dims(example["instance_image"], axis=0))
File "D:\anaconda\envs\sleap\lib\site-packages\keras\engine\base_layer.py", line 1037, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\inference.py", line 1722, in call
out = self.keras_model(crops)
File "D:\anaconda\envs\sleap\lib\site-packages\keras\engine\base_layer.py", line 1020, in __call__
input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
File "D:\anaconda\envs\sleap\lib\site-packages\keras\engine\input_spec.py", line 269, in assert_input_compatibility
', found shape=' + display_shape(x.shape))
ValueError: Input 0 is incompatible with layer model: expected shape=(None, 128, 128, 3), found shape=(1, 32, 32, 3)
See issue below for more.
Discussed in #871
Originally posted by Shifulai July 29, 2022
Thanks for your attention.
When I try to train the top-down centered instance model, training does not work when the input scaling is not 1.0. Training stays at epoch 1, but the runtime keeps increasing.
Bug report below
Software versions:
SLEAP: 1.2.6
TensorFlow: 2.6.3
Numpy: 1.19.5
Python: 3.7.12
OS: Windows-10-10.0.19041-SP0
Happy SLEAPing! :)
Using already trained model for centroid: D:/Desktop/CK/sleap/data\models\220729_134535.centroid.n=765\training_config.json
Resetting monitor window.
Polling: D:/Desktop/CK/sleap/data\models\220729_194813.centered_instance.n=765\viz\validation.*.png
Start training centered_instance...
['sleap-train', 'C:\\Users\\admin\\AppData\\Local\\Temp\\tmp1aqtnvzl\\220729_194813_training_job.json', 'D:/Desktop/CK/sleap/data/food competition.slp', '--zmq', '--save_viz']
INFO:sleap.nn.training:Versions:
SLEAP: 1.2.6
TensorFlow: 2.6.3
Numpy: 1.19.5
Python: 3.7.12
OS: Windows-10-10.0.19041-SP0
INFO:sleap.nn.training:Training labels file: D:/Desktop/CK/sleap/data/food competition.slp
INFO:sleap.nn.training:Training profile: C:\Users\admin\AppData\Local\Temp\tmp1aqtnvzl\220729_194813_training_job.json
INFO:sleap.nn.training:
INFO:sleap.nn.training:Arguments:
INFO:sleap.nn.training:{
"training_job_path": "C:\\Users\\admin\\AppData\\Local\\Temp\\tmp1aqtnvzl\\220729_194813_training_job.json",
"labels_path": "D:/Desktop/CK/sleap/data/food competition.slp",
"video_paths": [
""
],
"val_labels": null,
"test_labels": null,
"tensorboard": false,
"save_viz": true,
"zmq": true,
"run_name": "",
"prefix": "",
"suffix": "",
"cpu": false,
"first_gpu": false,
"last_gpu": false,
"gpu": 0
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Training job:
INFO:sleap.nn.training:{
"data": {
"labels": {
"training_labels": null,
"validation_labels": null,
"validation_fraction": 0.1,
"test_labels": null,
"split_by_inds": false,
"training_inds": null,
"validation_inds": null,
"test_inds": null,
"search_path_hints": [],
"skeletons": []
},
"preprocessing": {
"ensure_rgb": false,
"ensure_grayscale": false,
"imagenet_mode": null,
"input_scaling": 0.25,
"pad_to_stride": null,
"resize_and_pad_to_target": true,
"target_height": null,
"target_width": null
},
"instance_cropping": {
"center_on_part": "tail",
"crop_size": null,
"crop_size_detection_padding": 16
}
},
"model": {
"backbone": {
"leap": null,
"unet": {
"stem_stride": null,
"max_stride": 16,
"output_stride": 8,
"filters": 16,
"filters_rate": 1.5,
"middle_block": true,
"up_interpolate": true,
"stacks": 1
},
"hourglass": null,
"resnet": null,
"pretrained_encoder": null
},
"heads": {
"single_instance": null,
"centroid": null,
"centered_instance": {
"anchor_part": "tail",
"part_names": null,
"sigma": 2.5,
"output_stride": 8,
"loss_weight": 1.0,
"offset_refinement": false
},
"multi_instance": null,
"multi_class_bottomup": null,
"multi_class_topdown": null
}
},
"optimization": {
"preload_data": true,
"augmentation_config": {
"rotate": true,
"rotation_min_angle": -180.0,
"rotation_max_angle": 180.0,
"translate": false,
"translate_min": -5,
"translate_max": 5,
"scale": false,
"scale_min": 0.9,
"scale_max": 1.1,
"uniform_noise": false,
"uniform_noise_min_val": 0.0,
"uniform_noise_max_val": 10.0,
"gaussian_noise": false,
"gaussian_noise_mean": 5.0,
"gaussian_noise_stddev": 1.0,
"contrast": true,
"contrast_min_gamma": 0.5,
"contrast_max_gamma": 2.0,
"brightness": true,
"brightness_min_val": 0.0,
"brightness_max_val": 10.0,
"random_crop": false,
"random_crop_height": 256,
"random_crop_width": 256,
"random_flip": false,
"flip_horizontal": true
},
"online_shuffling": true,
"shuffle_buffer_size": 128,
"prefetch": true,
"batch_size": 4,
"batches_per_epoch": null,
"min_batches_per_epoch": 200,
"val_batches_per_epoch": null,
"min_val_batches_per_epoch": 10,
"epochs": 200,
"optimizer": "adam",
"initial_learning_rate": 0.0001,
"learning_rate_schedule": {
"reduce_on_plateau": true,
"reduction_factor": 0.5,
"plateau_min_delta": 1e-06,
"plateau_patience": 5,
"plateau_cooldown": 3,
"min_learning_rate": 1e-08
},
"hard_keypoint_mining": {
"online_mining": false,
"hard_to_easy_ratio": 2.0,
"min_hard_keypoints": 2,
"max_hard_keypoints": null,
"loss_scale": 5.0
},
"early_stopping": {
"stop_training_on_plateau": true,
"plateau_min_delta": 1e-08,
"plateau_patience": 20
}
},
"outputs": {
"save_outputs": true,
"run_name": "220729_194813.centered_instance.n=765",
"run_name_prefix": "",
"run_name_suffix": "",
"runs_folder": "D:/Desktop/CK/sleap/data\\models",
"tags": [
""
],
"save_visualizations": true,
"delete_viz_images": true,
"zip_outputs": false,
"log_to_csv": true,
"checkpointing": {
"initial_model": false,
"best_model": true,
"every_epoch": false,
"latest_model": false,
"final_model": false
},
"tensorboard": {
"write_logs": false,
"loss_frequency": "epoch",
"architecture_graph": false,
"profile_graph": false,
"visualizations": true
},
"zmq": {
"subscribe_to_controller": true,
"controller_address": "tcp://127.0.0.1:9000",
"controller_polling_timeout": 10,
"publish_updates": true,
"publish_address": "tcp://127.0.0.1:9001"
}
},
"name": "",
"description": "",
"sleap_version": "1.2.6",
"filename": "C:\\Users\\admin\\AppData\\Local\\Temp\\tmp1aqtnvzl\\220729_194813_training_job.json"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Using GPU 0 for acceleration.
INFO:sleap.nn.training:Disabled GPU memory pre-allocation.
INFO:sleap.nn.training:System:
GPUs: 1/1 available
Device: /physical_device:GPU:0
Available: True
Initalized: False
Memory growth: True
INFO:sleap.nn.training:
INFO:sleap.nn.training:Initializing trainer...
INFO:sleap.nn.training:Loading training labels from: D:/Desktop/CK/sleap/data/food competition.slp
INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1
INFO:sleap.nn.training: Splits: Training = 689 / Validation = 76.
INFO:sleap.nn.training:Setting up for training...
INFO:sleap.nn.training:Setting up pipeline builders...
INFO:sleap.nn.training:Setting up model...
INFO:sleap.nn.training:Building test pipeline...
2022-07-29 19:48:31.149710: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-29 19:48:33.014129: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3489 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6
2022-07-29 19:48:35.801343: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-07-29 19:48:47.065434: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 480 } dim { size: 270 } dim { size: 3 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2304 num_cores: 16 environment { key: "cpu_instruction_set" value: "SSE, SSE2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 16777216 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 128 } dim { size: 128 } dim { size: 3 } } }
INFO:sleap.nn.training:Loaded test example. [22.266s]
INFO:sleap.nn.training: Input shape: (128, 128, 3)
INFO:sleap.nn.training:Created Keras model.
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=1.5, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=1, up_interpolate=True, block_contraction=False)
INFO:sleap.nn.training: Max stride: 16
INFO:sleap.nn.training: Parameters: 265,575
INFO:sleap.nn.training: Heads:
INFO:sleap.nn.training: [0] = CenteredInstanceConfmapsHead(part_names=['nose', 'hear_r', 'hear_l', 'tail'], anchor_part='tail', sigma=2.5, output_stride=8, loss_weight=1.0)
INFO:sleap.nn.training: Outputs:
INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 16, 16, 4), dtype=tf.float32, name=None), name='CenteredInstanceConfmapsHead/BiasAdd:0', description="created by layer 'CenteredInstanceConfmapsHead'")
INFO:sleap.nn.training:Setting up data pipelines...
INFO:sleap.nn.training:Training set: n = 689
INFO:sleap.nn.training:Validation set: n = 76
INFO:sleap.nn.training:Setting up optimization...
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08)
INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=20)
INFO:sleap.nn.training:Setting up outputs...
INFO:sleap.nn.callbacks:Training controller subscribed to: tcp://127.0.0.1:9000 (topic: )
INFO:sleap.nn.training: ZMQ controller subcribed to: tcp://127.0.0.1:9000
INFO:sleap.nn.callbacks:Progress reporter publishing on: tcp://127.0.0.1:9001 for: not_set
INFO:sleap.nn.training: ZMQ progress reporter publish on: tcp://127.0.0.1:9001
INFO:sleap.nn.training:Created run path: D:/Desktop/CK/sleap/data\models\220729_194813.centered_instance.n=765
INFO:sleap.nn.training:Setting up visualization...
2022-07-29 19:48:59.507634: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 1920 } dim { size: 1080 } dim { size: 3 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2304 num_cores: 16 environment { key: "cpu_instruction_set" value: "SSE, SSE2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 16777216 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 }
dim { size: 128 } dim { size: 128 } dim { size: 3 } } }
2022-07-29 19:49:07.684222: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 1920 } dim { size: 1080 } dim { size: 3 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2304 num_cores: 16 environment { key: "cpu_instruction_set" value: "SSE, SSE2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 16777216 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 }
dim { size: 128 } dim { size: 128 } dim { size: 3 } } }
INFO:sleap.nn.training:Finished trainer set up. [41.8s]
INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation...
2022-07-29 19:55:14.233259: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 480 } dim { size: 270 } dim { size: 3 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2304 num_cores: 16 environment { key: "cpu_instruction_set" value: "SSE, SSE2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 16777216 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 128 } dim { size: 128 } dim { size: 3 } } }
2022-07-29 19:55:31.551806: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 480 } dim { size: 270 } dim { size: 3 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2304 num_cores: 16 environment { key: "cpu_instruction_set" value: "SSE, SSE2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 16777216 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 128 } dim { size: 128 } dim { size: 3 } } }
INFO:sleap.nn.training:Finished creating training datasets. [384.0s]
INFO:sleap.nn.training:Starting training loop...
2022-07-29 19:55:32.101723: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 480 } dim { size: 270 } dim { size: 3 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2304 num_cores: 16 environment { key: "cpu_instruction_set" value: "SSE, SSE2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 16777216 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 128 } dim { size: 128 } dim { size: 3 } } }
Epoch 1/200
2022-07-29 19:55:33.962928: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201
WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0000s vs `on_train_batch_end` time: 0.0156s). Check your callbacks.
2022-07-29 19:55:58.738032: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "CropAndResize" attr { key: "T" value { type: DT_FLOAT } } attr { key: "extrapolation_value" value { f: 0 } } attr { key: "method" value { s: "bilinear" } } inputs { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 480 } dim { size: 270 } dim { size: 3 } } } inputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 4 } } } inputs { dtype: DT_INT32 shape { dim { size: -2 } } } inputs { dtype: DT_INT32 shape { dim { size: 2 } } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2304 num_cores: 16 environment { key: "cpu_instruction_set" value: "SSE, SSE2" } environment { key: "eigen" value: "3.3.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 16777216 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { dim { size: -2 } dim { size: 128 } dim { size: 128 } dim { size: 3 } } }
344/344 - 30s - loss: 0.0325 - nose: 0.0373 - hear_r: 0.0405 - hear_l: 0.0394 - tail: 0.0128 - val_loss: 0.0242 - val_nose: 0.0294 - val_hear_r: 0.0325 - val_hear_l: 0.0307 - val_tail: 0.0041
Traceback (most recent call last):
File "D:\anaconda\envs\sleap\Scripts\sleap-train-script.py", line 33, in <module>
sys.exit(load_entry_point('sleap==1.2.6', 'console_scripts', 'sleap-train')())
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\training.py", line 1955, in main
trainer.train()
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\training.py", line 923, in train
verbose=2,
File "D:\anaconda\envs\sleap\lib\site-packages\keras\engine\training.py", line 1230, in fit
callbacks.on_epoch_end(epoch, epoch_logs)
File "D:\anaconda\envs\sleap\lib\site-packages\keras\callbacks.py", line 413, in on_epoch_end
callback.on_epoch_end(epoch, logs)
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\callbacks.py", line 280, in on_epoch_end
figure = self.plot_fn()
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\training.py", line 1328, in <lambda>
viz_fn=lambda: visualize_example(next(training_viz_ds_iter)),
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\training.py", line 1308, in visualize_example
preds = find_peaks(tf.expand_dims(example["instance_image"], axis=0))
File "D:\anaconda\envs\sleap\lib\site-packages\keras\engine\base_layer.py", line 1037, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "D:\anaconda\envs\sleap\lib\site-packages\sleap\nn\inference.py", line 1722, in call
out = self.keras_model(crops)
File "D:\anaconda\envs\sleap\lib\site-packages\keras\engine\base_layer.py", line 1020, in __call__
input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
File "D:\anaconda\envs\sleap\lib\site-packages\keras\engine\input_spec.py", line 269, in assert_input_compatibility
', found shape=' + display_shape(x.shape))
ValueError: Input 0 is incompatible with layer model: expected shape=(None, 128, 128, 3), found shape=(1, 32, 32, 3)
INFO:sleap.nn.callbacks:Closing the reporter controller/context.
INFO:sleap.nn.callbacks:Closing the training controller socket/context.