tensorflow - Training google Object Detection API grpc error -
I am following Google's Object Detection API instructions for retraining on my own dataset and have encountered a series of problems.
One of them is the following:
"traceback (most recent call last): file "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main "__main__", fname, loader, pkg_name) file "/usr/lib/python2.7/runpy.py", line 72, in _run_code exec code in run_globals file "/root/.local/lib/python2.7/site-packages/object_detection/train.py", line 198, in <module> tf.app.run() file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 44, in run _sys.exit(main(_sys.argv[:1] + flags_passthrough)) file "/root/.local/lib/python2.7/site-packages/object_detection/train.py", line 194, in main worker_job_name, is_chief, flags.train_dir) file "/root/.local/lib/python2.7/site-packages/object_detection/trainer.py", line 290, in train saver=saver) file "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/slim/python/slim/learning.py", line 776, in train master, start_standard_services=false, config=session_config) sess: file "/usr/lib/python2.7/contextlib.py", line 17, in __enter__ return self.gen.next() file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 960, in managed_session self.stop(close_summary_writer=close_summary_writer) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 788, in stop stop_grace_period_secs=self._stop_grace_secs) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/coordinator.py", line 386, in join six.reraise(*self._exc_info_to_raise) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 949, in managed_session start_standard_services=start_standard_services) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/supervisor.py", line 713, in prepare_or_wait_for_session max_wait_secs=max_wait_secs) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/session_manager.py", line 387, in wait_for_session is_ready, not_ready_msg = self._model_ready(sess) file 
"/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/session_manager.py", line 435, in _model_ready return _ready(self._ready_op, sess, "model not ready") file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/session_manager.py", line 492, in _ready ready_value = sess.run(op) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 767, in run run_metadata_ptr) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 965, in _run feed_dict_string, options, run_metadata) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1015, in _do_run target_list, options, run_metadata) file "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1035, in _do_call raise type(e)(node_def, op, message) unavailableerror: {"created":"@1502405189.800982817","description":"eof","file":"external/grpc/src/core/lib/iomgr/tcp_posix.c","file_line":235,"grpc_status":14} " pathname: "/var/sitecustomize/sitecustomize.py" }
I am not sure about gRPC, and I am quite at a standstill with this error. Any help would be great! Thanks!
This is an out-of-memory error (see this question).
You can try using a larger machine type, particularly for the master, e.g. large_model, complex_model_l, or complex_model_l_gpu. You do this by passing a file as the --config argument of gcloud, with contents similar to the following:
trainingInput:
  runtimeVersion: "1.0"
  scaleTier: CUSTOM
  masterType: complex_model_l_gpu
  workerCount: 9
  workerType: standard_gpu
  parameterServerCount: 3
  parameterServerType: standard
Comments
Post a Comment