我正在尝试构建自己的 CNN 模型,用来判断两张图像是否属于同一个人(人脸匹配),以及其中一张图像是否确实是真人的照片(活体检测)。我已经构建好数据集,并且能够训练模型(使用 Kaggle Notebook 提供的免费 CPU),但训练速度非常慢(每个 epoch 约 10 小时)。因此,我一直在尝试启用 TPU 加速,但没有成功。训练循环中出现了下面这个错误,我看不懂它的含义:
TypeError: in user code:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function *
return step_function(self, iterator)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:795 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
/opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/tpu_strategy.py:540 run
return self.extended.tpu_run(fn, args, kwargs, options)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/tpu_strategy.py:1296 tpu_run
return func(args, kwargs)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/tpu_strategy.py:1345 tpu_function
maximum_shape = tensor_shape.TensorShape([None] * rank)
TypeError: can't multiply sequence by non-int of type 'NoneType'
进一步排查后,我发现在尝试迭代数据集时会出现以下错误:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py in _next_internal(self)
736 # Fast path for the case `self._structure` is not a nested structure.
--> 737 return self._element_spec._from_compatible_tensor_list(ret) # pylint: disable=protected-access
738 except AttributeError:
AttributeError: 'tuple' object has no attribute '_from_compatible_tensor_list'
During handling of the above exception, another exception occurred:
UnavailableError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/context.py in execution_mode(mode)
2112 ctx.executor = executor_new
-> 2113 yield
2114 finally:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py in _next_internal(self)
738 except AttributeError:
--> 739 return structure.from_compatible_tensor_list(self._element_spec, ret)
740
/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/util/structure.py in from_compatible_tensor_list(element_spec, tensor_list)
243 lambda spec, value: spec._from_compatible_tensor_list(value),
--> 244 element_spec, tensor_list)
245
/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/util/structure.py in _from_tensor_list_helper(decode_fn, element_spec, tensor_list)
218 value = tensor_list[i:i + num_flat_values]
--> 219 flat_ret.append(decode_fn(component_spec, value))
220 i += num_flat_values
/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/util/structure.py in <lambda>(spec, value)
242 return _from_tensor_list_helper(
--> 243 lambda spec, value: spec._from_compatible_tensor_list(value),
244 element_spec, tensor_list)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/tensor_spec.py in _from_compatible_tensor_list(self, tensor_list)
176 assert len(tensor_list) == 1
--> 177 tensor_list[0].set_shape(self._shape)
178 return tensor_list[0]
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in set_shape(self, shape)
1213 def set_shape(self, shape):
-> 1214 if not self.shape.is_compatible_with(shape):
1215 raise ValueError(
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in shape(self)
1174 # `EagerTensor`, in C.
-> 1175 self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
1176 except core._NotOkStatusException as e:
UnavailableError: failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"@1669489130.049337898","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":4143,"referenced_errors":[{"created":"@1669489130.049336302","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
During handling of the above exception, another exception occurred:
UnavailableError Traceback (most recent call last)
/tmp/ipykernel_20/3230907211.py in <module>
----> 1 for i in dataset.take(1):
2 print(i)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py in __next__(self)
745 def __next__(self):
746 try:
--> 747 return self._next_internal()
748 except errors.OutOfRangeError:
749 raise StopIteration
/opt/conda/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py in _next_internal(self)
737 return self._element_spec._from_compatible_tensor_list(ret) # pylint: disable=protected-access
738 except AttributeError:
--> 739 return structure.from_compatible_tensor_list(self._element_spec, ret)
740
741 @property
/opt/conda/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/context.py in execution_mode(mode)
2114 finally:
2115 ctx.executor = executor_old
-> 2116 executor_new.wait()
2117
2118
/opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/executor.py in wait(self)
67 def wait(self):
68 """Waits for ops dispatched in this executor to finish."""
---> 69 pywrap_tfe.TFE_ExecutorWaitForAllPendingNodes(self._handle)
70
71 def clear_error(self):
UnavailableError: failed to connect to all addresses
Additional GRPC error information from remote target /job:localhost/replica:0/task:0/device:CPU:0:
:{"created":"@1669489130.049337898","description":"Failed to pick subchannel","file":"third_party/grpc/src/core/ext/filters/client_channel/client_channel.cc","file_line":4143,"referenced_errors":[{"created":"@1669489130.049336302","description":"failed to connect to all addresses","file":"third_party/grpc/src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.cc","file_line":398,"grpc_status":14}]}
我就是不明白我做错了什么。
我的笔记本供参考:https://github.com/guerchen/my-own-facematch-liveness/blob/main/my-own-facematch-liveness.ipynb
我已经尝试过为所有张量显式指定形状,并对图像进行 reshape。至于 TPU 的配置,我是按照 Kaggle 的这份指南操作的:https://www.kaggle.com/code/philculliton/a-simple-tf-2-1-notebook/notebook#Load-my-data
你好
what is up
你好what is up
你好what is up
你好what is up
你好what is up