训练 xception 模型 keras - 批量大小 32 给出错误,但它适用于批量大小 = 16

问题描述 投票:0回答:1

训练 xception 模型 keras - 批量大小 32 给出错误,但它适用于批量大小 = 16
下面是错误日志的详细信息,你能帮我吗?我猜测以下信息是错误的关键,但无法弄清楚原因:在 /job:localhost/replica:0/task:0/device:GPU:0 上,分配器 GPU_0_bfc 在分配形状为 [728,728,1,1]、类型为 float 的张量时出现 OOM(显存不足)

   ResourceExhaustedError                    Traceback (most recent call last)
    Cell In[34], line 7
          2 model_save =  ModelCheckpoint('/kaggle/working/model_weights.keras' , monitor = 'val_loss', save_best_only = True, mode = 'min')
          3 reduce_lr =  ReduceLROnPlateau(monitor='val_loss', factor=0.1,
          4                               patience=4, min_lr=0.0001)
    ----> 7 history = model.fit(train_it, steps_per_epoch= steps_per_epoch, validation_data=val_it,
          8              validation_steps=validation_steps, epochs = epochs, callbacks=[early_stopping, model_save, reduce_lr] )
    
    File /opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
         67     filtered_tb = _process_traceback_frames(e.__traceback__)
         68     # To get the full stack trace, call:
         69     # `tf.debugging.disable_traceback_filtering()`
    ---> 70     raise e.with_traceback(filtered_tb) from None
         71 finally:
         72     del filtered_tb
    
    File /opt/conda/lib/python3.10/site-packages/tensorflow/python/eager/execute.py:52, in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
         50 try:
         51   ctx.ensure_initialized()
    ---> 52   tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
         53                                       inputs, attrs, num_outputs)
         54 except core._NotOkStatusException as e:
         55   if name is not None:
    
    ResourceExhaustedError: Graph execution error:
    
    Detected at node 'model_1/block6_sepconv2/separable_conv2d' defined at (most recent call last):
        File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
          return _run_code(code, main_globals, None,
        File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
          exec(code, run_globals)
        File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
          app.launch_new_instance()
        File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
          app.start()
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 728, in start
          self.io_loop.start()
        File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
          self.asyncio_loop.run_forever()
        File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
          self._run_once()
        File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
          handle._run()
        File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
          self._context.run(self._callback, *self._args)
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
          await self.process_one()
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 502, in process_one
          await dispatch(*args)
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
          await result
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
          reply_content = await reply_content
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
          res = shell.run_cell(
        File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
          return super().run_cell(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
          result = self._run_cell(
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
          result = runner(coro)
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
          coro.send(None)
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
          has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
          if await self.run_code(code, result, async_=asy):
        File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
          exec(code_obj, self.user_global_ns, self.user_ns)
        File "/tmp/ipykernel_33/698136834.py", line 7, in <module>
          history = model.fit(train_it, steps_per_epoch= steps_per_epoch, validation_data=val_it,
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1685, in fit
          tmp_logs = self.train_function(iterator)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function
          return step_function(self, iterator)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function
          outputs = model.distribute_strategy.run(run_step, args=(data,))
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step
          outputs = model.train_step(data)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1050, in train_step
          y_pred = self(x, training=True)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 558, in __call__
          return super().__call__(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
          outputs = call_fn(inputs, *args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/functional.py", line 512, in call
          return self._run_internal_graph(inputs, training=training, mask=mask)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/functional.py", line 669, in _run_internal_graph
          outputs = node.layer(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
          outputs = call_fn(inputs, *args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
          return fn(*args, **kwargs)
        File "/opt/conda/lib/python3.10/site-packages/keras/layers/convolutional/separable_conv2d.py", line 188, in call
          outputs = tf.compat.v1.nn.separable_conv2d(
    Node: 'model_1/block6_sepconv2/separable_conv2d'
    OOM when allocating tensor with shape[728,728,1,1] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
         [[{{node model_1/block6_sepconv2/separable_conv2d}}]]
    Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
     [Op:__inference_train_function_39551]
tensorflow keras deep-learning
1个回答
0
投票

这是 GPU 显存不足(OOM)错误:批量大小为 32 时,前向/反向传播所需的激活显存超出了 GPU 容量,除了减小批量大小之外别无他法。

批量大小不必是 2 的幂,你可以尝试 batch_size = 24,从 16 开始逐渐增大,直到达到显存上限、充分利用 GPU 内存为止。

© www.soinside.com 2019 - 2024. All rights reserved.